diff --git a/build.xml b/build.xml
index 85955d774..ef53f6aa4 100644
--- a/build.xml
+++ b/build.xml
@@ -49,7 +49,7 @@
-
+
@@ -489,7 +489,7 @@
docletpathref="doclet.classpath"
classpathref="external.dependencies"
classpath="${java.classes}"
- additionalparam="-private -build-timestamp "${build.timestamp}" -absolute-version ${build.version} -quiet -J-Xdebug -J-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005">
+ additionalparam="-private -build-timestamp "${build.timestamp}" -absolute-version ${build.version} -quiet -J-Xdebug -J-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005">
diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
index b9e380295..2ff8aa979 100755
--- a/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
+++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/AnalyzeCovariates.java
@@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
+import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.*;
@@ -42,19 +43,71 @@ import java.util.Map;
import java.util.regex.Pattern;
/**
- * Created by IntelliJ IDEA.
- * User: rpoplin
- * Date: Dec 1, 2009
+ * Call R scripts to plot residual error versus the various covariates.
*
- * Create collapsed versions of the recal csv file and call R scripts to plot residual error versus the various covariates.
+ *
+ * After counting covariates in either the initial BAM File or again in the recalibrated BAM File, an analysis tool is available which
+ * reads the .csv file and outputs several PDF (and .dat) files for each read group in the given BAM. These PDF files graphically
+ * show the various metrics and characteristics of the reported quality scores (often in relation to the empirical qualities).
+ * In order to show that any biases in the reported quality scores have been generally fixed through recalibration one should run
+ * CountCovariates again on a bam file produced by TableRecalibration. In this way users can compare the analysis plots generated
+ * by pre-recalibration and post-recalibration .csv files. Our usual chain of commands that we use to generate plots of residual
+ * error is: CountCovariates, TableRecalibrate, samtools index on the recalibrated bam file, CountCovariates again on the recalibrated
+ * bam file, and then AnalyzeCovariates on both the before and after recal_data.csv files to see the improvement in recalibration.
+ *
+ *
+ * The color coding along with the RMSE is included in the plots to give some indication of the number of observations that went into
+ * each of the quality score estimates. It is defined as follows for N, the number of observations:
+ *
+ *
+ * - light blue means N < 1,000
+ * - cornflower blue means 1,000 <= N < 10,000
+ * - dark blue means N >= 10,000
+ * - The pink dots indicate points whose quality scores are special codes used by the aligner and which are mathematically
+ * meaningless and so aren't included in any of the numerical calculations.
+ *
+ *
+ *
+ * NOTE: For those running this tool externally from the Broad, it is crucial to note that both the -Rscript and -resources options
+ * must be changed from the default. -Rscript needs to point to your installation of Rscript (this is the scripting version of R,
+ * not the interactive version) while -resources needs to point to the folder holding the R scripts that are used. For those using
+ * this tool as part of the Binary Distribution the -resources should point to the resources folder that is part of the tarball.
+ * For those using this tool by building from the git repository the -resources should point to the R/ subdirectory of the Sting checkout.
+ *
+ *
+ * See the GATK wiki for a tutorial and example recalibration accuracy plots.
+ * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
+ *
+ *
Input
+ *
+ * The recalibration table file in CSV format that was generated by the CountCovariates walker.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx4g -jar AnalyzeCovariates.jar \
+ * -recalFile /path/to/recal.table.csv \
+ * -outputDir /path/to/output_dir/ \
+ * -resources resources/ \
+ * -ignoreQ 5
+ *
+ *
*/
+@DocumentedGATKFeature(
+ groupName = "AnalyzeCovariates",
+ summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
public class AnalyzeCovariates extends CommandLineProgram {
/////////////////////////////
// Command Line Arguments
/////////////////////////////
-
+ /**
+ * After the header, data records occur one per line until the end of the file. The first several items on a line are the
+ * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
+ * three items are the data — that is, the number of observations for this combination of covariates, the number of reference mismatches,
+ * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
+ */
@Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false)
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
@@ -67,11 +120,20 @@ public class AnalyzeCovariates extends CommandLineProgram {
private int IGNORE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
+
+ /**
+ * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
+ * by capping at the specified value. We've found that Q40 is too low when using a more complete database of known variation like dbSNP build 132 or later.
+ */
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 50")
private int MAX_QUALITY_SCORE = 50;
+
+ /**
+ * This argument is useful when comparing before/after plots and you want the axes to match each other.
+ */
@Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
private int MAX_HISTOGRAM_VALUE = 0;
- @Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, this value will be the max value of the histogram plots")
+ @Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting")
private boolean DO_INDEL_QUALITY = false;
diff --git a/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java b/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java
new file mode 100644
index 000000000..9350e4a66
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/analyzecovariates/package-info.java
@@ -0,0 +1,4 @@
+/**
+ * Package to plot residual accuracy versus error covariates for the base quality score recalibrator.
+ */
+package org.broadinstitute.sting.analyzecovariates;
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/commandline/Advanced.java b/public/java/src/org/broadinstitute/sting/commandline/Advanced.java
new file mode 100644
index 000000000..7aeefe261
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/Advanced.java
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.commandline;
+
+import java.lang.annotation.*;
+
+/**
+ * Indicates that a walker argument is considered an advanced option.
+ *
+ * @author Mark DePristo
+ * @version 0.1
+ */
+@Documented
+@Inherited
+@Retention(RetentionPolicy.RUNTIME)
+@Target({ElementType.TYPE,ElementType.FIELD})
+public @interface Advanced {
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java
index e0e2ac378..8ec0d650a 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentSource.java
@@ -151,6 +151,14 @@ public class ArgumentSource {
return field.isAnnotationPresent(Hidden.class) || field.isAnnotationPresent(Deprecated.class);
}
+ /**
+ * Is the given argument considered an advanced option when displaying on the command-line argument system.
+ * @return True if so. False otherwise.
+ */
+ public boolean isAdvanced() {
+ return field.isAnnotationPresent(Advanced.class);
+ }
+
/**
* Is this command-line argument dependent on some primitive argument types?
* @return True if this command-line argument depends on other arguments; false otherwise.
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
index d1d4ff914..ff992d77d 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
@@ -325,7 +325,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
@Override
public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) {
- Class parameterType = getParameterizedTypeClass(type);
+ Class parameterType = JVMUtils.getParameterizedTypeClass(type);
return RodBinding.makeUnbound((Class extends Feature>)parameterType);
}
@@ -338,6 +338,8 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) {
ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
String value = getArgumentValue( defaultDefinition, matches );
+ Class extends Feature> parameterType = JVMUtils.getParameterizedTypeClass(type);
+
try {
String name = defaultDefinition.fullName;
String tribbleType = null;
@@ -372,19 +374,19 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file);
if ( featureDescriptor != null ) {
tribbleType = featureDescriptor.getName();
- logger.warn("Dynamically determined type of " + file + " to be " + tribbleType);
+ logger.info("Dynamically determined type of " + file + " to be " + tribbleType);
}
}
+
+ if ( tribbleType == null )
+ throw new UserException.CommandLineException(
+ String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " +
+ "Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s",
+ manager.userFriendlyListOfAvailableFeatures(parameterType)));
}
}
- if ( tribbleType == null ) // error handling
- throw new UserException.CommandLineException(
- String.format("Could not parse argument %s with value %s",
- defaultDefinition.fullName, value));
-
Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class);
- Class parameterType = getParameterizedTypeClass(type);
RodBinding result = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags);
parsingEngine.addTags(result,tags);
parsingEngine.addRodBinding(result);
@@ -395,20 +397,10 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
value, source.field.getName()));
} catch (Exception e) {
throw new UserException.CommandLineException(
- String.format("Failed to parse value %s for argument %s.",
- value, source.field.getName()));
+ String.format("Failed to parse value %s for argument %s. Message: %s",
+ value, source.field.getName(), e.getMessage()));
}
}
-
- private Class getParameterizedTypeClass(Type t) {
- if ( t instanceof ParameterizedType ) {
- ParameterizedType parameterizedType = (ParameterizedType)t;
- if ( parameterizedType.getActualTypeArguments().length != 1 )
- throw new ReviewedStingException("BUG: more than 1 generic type found on class" + t);
- return (Class)parameterizedType.getActualTypeArguments()[0];
- } else
- throw new ReviewedStingException("BUG: could not find generic type on class " + t);
- }
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/commandline/Output.java b/public/java/src/org/broadinstitute/sting/commandline/Output.java
index 22565dbf5..f8aef0355 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/Output.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/Output.java
@@ -55,7 +55,7 @@ public @interface Output {
* --help argument is specified.
* @return Doc string associated with this command-line argument.
*/
- String doc() default "An output file presented to the walker. Will overwrite contents if file exists.";
+ String doc() default "An output file created by the walker. Will overwrite contents if file exists";
/**
* Is this argument required. If true, the command-line argument system will
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
index 32132c7ca..32002e093 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java
@@ -96,24 +96,23 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
loadArgumentsIntoObject(walker);
argumentSources.add(walker);
- Collection newStyle = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
+ Collection rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
// todo: remove me when the old style system is removed
if ( getArgumentCollection().RODBindings.size() > 0 ) {
logger.warn("################################################################################");
logger.warn("################################################################################");
- logger.warn("Deprecated -B rod binding syntax detected. This syntax will be retired in GATK 1.2.");
+ logger.warn("Deprecated -B rod binding syntax detected. This syntax has been eliminated in GATK 1.2.");
logger.warn("Please use arguments defined by each specific walker instead.");
for ( String oldStyleRodBinding : getArgumentCollection().RODBindings ) {
logger.warn(" -B rod binding with value " + oldStyleRodBinding + " tags: " + parser.getTags(oldStyleRodBinding).getPositionalTags());
}
logger.warn("################################################################################");
logger.warn("################################################################################");
+ System.exit(1);
}
- Collection oldStyle = ListFileUtils.unpackRODBindingsOldStyle(getArgumentCollection().RODBindings, parser);
- oldStyle.addAll(newStyle);
- engine.setReferenceMetaDataFiles(oldStyle);
+ engine.setReferenceMetaDataFiles(rodBindings);
for (ReadFilter filter: filters) {
loadArgumentsIntoObject(filter);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
index 7e96b609e..b8488dc9a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
@@ -31,13 +31,11 @@ import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
+import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
import org.broadinstitute.sting.gatk.walkers.Attribution;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.help.ApplicationDetails;
-import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
-import org.broadinstitute.sting.utils.help.GATKDocUtils;
-import org.broadinstitute.sting.utils.help.GATKDoclet;
+import org.broadinstitute.sting.utils.help.*;
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
import java.util.*;
@@ -52,7 +50,7 @@ import java.util.*;
@DocumentedGATKFeature(
groupName = "GATK Engine",
summary = "Features and arguments for the GATK engine itself, available to all walkers.",
- extraDocs = { ReadFilter.class, UserException.class })
+ extraDocs = { UserException.class })
public class CommandLineGATK extends CommandLineExecutable {
@Argument(fullName = "analysis_type", shortName = "T", doc = "Type of analysis to run")
private String analysisName = null;
@@ -177,6 +175,10 @@ public class CommandLineGATK extends CommandLineExecutable {
StringBuilder additionalHelp = new StringBuilder();
Formatter formatter = new Formatter(additionalHelp);
+ formatter.format("Available Reference Ordered Data types:%n");
+ formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures());
+ formatter.format("%n");
+
formatter.format("For a full description of this walker, see its GATKdocs at:%n");
formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType));
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index b0c4e203b..5b9ebd99b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -689,8 +689,6 @@ public class GenomeAnalysisEngine {
validateSuppliedReads();
readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference());
- sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles);
-
for (ReadFilter filter : filters)
filter.initialize(this);
@@ -963,7 +961,7 @@ public class GenomeAnalysisEngine {
/**
* Get the list of intervals passed to the engine.
- * @return List of intervals.
+ * @return List of intervals, or null if no intervals are in use
*/
public GenomeLocSortedSet getIntervals() {
return this.intervals;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java
index ce638ff2b..2f4dd06e2 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/DbsnpArgumentCollection.java
@@ -39,8 +39,7 @@ import org.simpleframework.xml.*;
public class DbsnpArgumentCollection {
/**
- * A dbSNP VCF file. Variants in this track will be treated as "known" variants
- * in tools using this track.
+ * A dbSNP VCF file.
*/
@Input(fullName="dbsnp", shortName = "D", doc="dbSNP file", required=false)
public RodBinding dbsnp;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index 62135f21b..fd39d46b0 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -101,6 +101,8 @@ public class GATKArgumentCollection {
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;
+ @Deprecated
+ @Hidden
@ElementList(required = false)
@Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :, ", required = false)
public ArrayList RODBindings = new ArrayList();
@@ -340,14 +342,6 @@ public class GATKArgumentCollection {
return false;
}
}
- if (other.RODBindings.size() != RODBindings.size()) {
- return false;
- }
- for (int x = 0; x < RODBindings.size(); x++) {
- if (!RODBindings.get(x).equals(other.RODBindings.get(x))) {
- return false;
- }
- }
if (!other.samFiles.equals(this.samFiles)) {
return false;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java
similarity index 95%
rename from public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java
rename to public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java
index cd77a9e7e..4ec451567 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckReadFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FailsVendorQualityCheckFilter.java
@@ -34,7 +34,7 @@ import net.sf.samtools.SAMRecord;
* Filter out FailsVendorQualityCheck reads.
*/
-public class FailsVendorQualityCheckReadFilter extends ReadFilter {
+public class FailsVendorQualityCheckFilter extends ReadFilter {
public boolean filterOut( final SAMRecord read ) {
return read.getReadFailsVendorQualityCheckFlag();
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java
similarity index 96%
rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java
rename to public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java
index 75369b306..ed9c37dca 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityReadFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityFilter.java
@@ -35,7 +35,7 @@ import org.broadinstitute.sting.commandline.Argument;
* @version 0.1
*/
-public class MappingQualityReadFilter extends ReadFilter {
+public class MappingQualityFilter extends ReadFilter {
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false)
public int MIN_MAPPING_QUALTY_SCORE = 10;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java
similarity index 95%
rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java
rename to public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java
index 1afec36d1..ccdb40d31 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableReadFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityUnavailableFilter.java
@@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
* @version 0.1
*/
-public class MappingQualityUnavailableReadFilter extends ReadFilter {
+public class MappingQualityUnavailableFilter extends ReadFilter {
public boolean filterOut(SAMRecord rec) {
return (rec.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE);
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java
similarity index 95%
rename from public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java
rename to public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java
index e49d4117c..57db8419c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroReadFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/MappingQualityZeroFilter.java
@@ -33,7 +33,7 @@ import net.sf.samtools.SAMRecord;
* @version 0.1
*/
-public class MappingQualityZeroReadFilter extends ReadFilter {
+public class MappingQualityZeroFilter extends ReadFilter {
public boolean filterOut(SAMRecord rec) {
return (rec.getMappingQuality() == 0);
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java
similarity index 95%
rename from public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java
rename to public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java
index 31c2144ce..50cd30f71 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentReadFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/NotPrimaryAlignmentFilter.java
@@ -34,7 +34,7 @@ import net.sf.samtools.SAMRecord;
* Filter out duplicate reads.
*/
-public class NotPrimaryAlignmentReadFilter extends ReadFilter {
+public class NotPrimaryAlignmentFilter extends ReadFilter {
public boolean filterOut( final SAMRecord read ) {
return read.getNotPrimaryAlignmentFlag();
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java
index b9aaf47de..286e22369 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java
@@ -333,10 +333,6 @@ public class RefMetaDataTracker {
return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false);
}
@Deprecated
- public List getValues(final Class type, final Collection names, final GenomeLoc onlyAtThisLoc) {
- return addValues(names, type, new ArrayList(), onlyAtThisLoc, true, false);
- }
- @Deprecated
public T getFirstValue(final Class type, final String name) {
return safeGetFirst(getValues(type, name));
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java
index 902f9d308..bf490e28c 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java
@@ -1,10 +1,11 @@
package org.broadinstitute.sting.gatk.refdata;
+import net.sf.samtools.util.SequenceUtil;
import org.broad.tribble.Feature;
+import org.broad.tribble.annotation.Strand;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.gelitext.GeliTextFeature;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
@@ -92,6 +93,67 @@ public class VariantContextAdaptors {
// --------------------------------------------------------------------------------------------------------------
private static class DBSnpAdaptor implements VCAdaptor {
+ private static boolean isSNP(DbSNPFeature feature) {
+ return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact");
+ }
+
+ private static boolean isMNP(DbSNPFeature feature) {
+ return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range");
+ }
+
+ private static boolean isInsertion(DbSNPFeature feature) {
+ return feature.getVariantType().contains("insertion");
+ }
+
+ private static boolean isDeletion(DbSNPFeature feature) {
+ return feature.getVariantType().contains("deletion");
+ }
+
+ private static boolean isIndel(DbSNPFeature feature) {
+ return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature);
+ }
+
+ public static boolean isComplexIndel(DbSNPFeature feature) {
+ return feature.getVariantType().contains("in-del");
+ }
+
+ /**
+ * gets the alternate alleles. This method should return all the alleles present at the location,
+ * NOT including the reference base. This is returned as a string list with no guarantee ordering
+ * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
+ * frequency).
+ *
+ * @return an alternate allele list
+ */
+ public static List getAlternateAlleleList(DbSNPFeature feature) {
+ List ret = new ArrayList();
+ for (String allele : getAlleleList(feature))
+ if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele);
+ return ret;
+ }
+
+ /**
+ * gets the alleles. This method should return all the alleles present at the location,
+ * including the reference base. The first allele should always be the reference allele, followed
+ * by an unordered list of alternate alleles.
+ *
+ * @return an alternate allele list
+ */
+ public static List getAlleleList(DbSNPFeature feature) {
+ List alleleList = new ArrayList();
+ // add ref first
+ if ( feature.getStrand() == Strand.POSITIVE )
+ alleleList = Arrays.asList(feature.getObserved());
+ else
+ for (String str : feature.getObserved())
+ alleleList.add(SequenceUtil.reverseComplement(str));
+ if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase())
+ && !alleleList.get(0).equals(feature.getNCBIRefBase()) )
+ Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0);
+
+ return alleleList;
+ }
+
/**
* Converts non-VCF formatted dbSNP records to VariantContext.
* @return DbSNPFeature.
@@ -102,18 +164,18 @@ public class VariantContextAdaptors {
@Override
public VariantContext convert(String name, Object input, ReferenceContext ref) {
DbSNPFeature dbsnp = (DbSNPFeature)input;
- if ( ! Allele.acceptableAlleleBases(DbSNPHelper.getReference(dbsnp)) )
+ if ( ! Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) )
return null;
- Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true);
+ Allele refAllele = Allele.create(dbsnp.getNCBIRefBase(), true);
- if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || DbSNPHelper.isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
+ if ( isSNP(dbsnp) || isIndel(dbsnp) || isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
// add the reference allele
List alleles = new ArrayList();
alleles.add(refAllele);
// add all of the alt alleles
boolean sawNullAllele = refAllele.isNull();
- for ( String alt : DbSNPHelper.getAlternateAlleleList(dbsnp) ) {
+ for ( String alt : getAlternateAlleleList(dbsnp) ) {
if ( ! Allele.acceptableAlleleBases(alt) ) {
//System.out.printf("Excluding dbsnp record %s%n", dbsnp);
return null;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/DbSNPHelper.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/features/DbSNPHelper.java
deleted file mode 100644
index a2132cee5..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/features/DbSNPHelper.java
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2011, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.refdata.features;
-
-import net.sf.samtools.util.SequenceUtil;
-import org.broad.tribble.annotation.Strand;
-import org.broad.tribble.dbsnp.DbSNPFeature;
-import org.broadinstitute.sting.utils.Utils;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * this class contains static helper methods for DbSNP
- */
-public class DbSNPHelper {
-
- private DbSNPHelper() {} // don't make a DbSNPHelper
-
- public static String rsIDOfFirstRealVariant(List<VariantContext> VCs, VariantContext.Type type) {
- if ( VCs == null )
- return null;
-
- String rsID = null;
- for ( VariantContext vc : VCs ) {
- if ( vc.getType() == type ) {
- rsID = vc.getID();
- break;
- }
- }
-
- return rsID;
- }
-
- /**
- * get the -1 * (log 10 of the error value)
- *
- * @return the log based error estimate
- */
- public static double getNegLog10PError(DbSNPFeature feature) {
- return 4; // -log10(0.0001)
- }
-
- //
- // What kind of variant are we?
- //
- // ----------------------------------------------------------------------
- public static boolean isSNP(DbSNPFeature feature) {
- return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact");
- }
-
- public static boolean isMNP(DbSNPFeature feature) {
- return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range");
- }
-
- public static String toMediumString(DbSNPFeature feature) {
- String s = String.format("%s:%d:%s:%s", feature.getChr(), feature.getStart(), feature.getRsID(), Utils.join("",feature.getObserved()));
- if (isSNP(feature)) s += ":SNP";
- if (isIndel(feature)) s += ":Indel";
- if (isHapmap(feature)) s += ":Hapmap";
- if (is2Hit2Allele(feature)) s += ":2Hit";
- return s;
- }
-
- public static boolean isInsertion(DbSNPFeature feature) {
- return feature.getVariantType().contains("insertion");
- }
-
- public static boolean isDeletion(DbSNPFeature feature) {
- return feature.getVariantType().contains("deletion");
- }
-
- public static boolean isIndel(DbSNPFeature feature) {
- return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || DbSNPHelper.isComplexIndel(feature);
- }
-
- public static boolean isComplexIndel(DbSNPFeature feature) {
- return feature.getVariantType().contains("in-del");
- }
-
- public static boolean isHapmap(DbSNPFeature feature) {
- return feature.getValidationStatus().contains("by-hapmap");
- }
-
- public static boolean is2Hit2Allele(DbSNPFeature feature) {
- return feature.getValidationStatus().contains("by-2hit-2allele");
- }
-
- public static boolean is1000genomes(DbSNPFeature feature) {
- return feature.getValidationStatus().contains("by-1000genomes");
- }
-
- public static boolean isMQ1(DbSNPFeature feature) {
- return feature.getWeight() == 1;
- }
-
- /**
- * gets the alternate alleles. This method should return all the alleles present at the location,
- * NOT including the reference base. This is returned as a string list with no guarantee ordering
- * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
- * frequency).
- *
- * @return an alternate allele list
- */
- public static List<String> getAlternateAlleleList(DbSNPFeature feature) {
- List<String> ret = new ArrayList<String>();
- for (String allele : getAlleleList(feature))
- if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele);
- return ret;
- }
-
- public static boolean onFwdStrand(DbSNPFeature feature) {
- return feature.getStrand() == Strand.POSITIVE;
- }
-
- public static String getReference(DbSNPFeature feature) {
- return feature.getNCBIRefBase();
- }
-
- public static String toSimpleString(DbSNPFeature feature) {
- return String.format("%s:%s:%s", feature.getRsID(), feature.getObserved(), (feature.getStrand() == Strand.POSITIVE) ? "+" : "-");
- }
-
- /**
- * gets the alleles. This method should return all the alleles present at the location,
- * including the reference base. The first allele should always be the reference allele, followed
- * by an unordered list of alternate alleles.
- *
- * @return an alternate allele list
- */
- public static List<String> getAlleleList(DbSNPFeature feature) {
- List<String> alleleList = new ArrayList<String>();
- // add ref first
- if ( onFwdStrand(feature) )
- alleleList = Arrays.asList(feature.getObserved());
- else
- for (String str : feature.getObserved())
- alleleList.add(SequenceUtil.reverseComplement(str));
- if ( alleleList.size() > 0 && alleleList.contains(getReference(feature)) && !alleleList.get(0).equals(getReference(feature)) )
- Collections.swap(alleleList, alleleList.indexOf(getReference(feature)), 0);
-
- return alleleList;
- }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java
index 26a400071..c99aea254 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/tracks/FeatureManager.java
@@ -36,7 +36,10 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.help.GATKDocUtils;
+import org.broadinstitute.sting.utils.help.HelpUtils;
+import javax.mail.Header;
import java.io.File;
import java.util.*;
@@ -50,7 +53,7 @@ import java.util.*;
* @author depristo
*/
public class FeatureManager {
- public static class FeatureDescriptor {
+ public static class FeatureDescriptor implements Comparable<FeatureDescriptor> {
final String name;
final FeatureCodec codec;
@@ -62,6 +65,7 @@ public class FeatureManager {
public String getName() {
return name;
}
+ public String getSimpleFeatureName() { return getFeatureClass().getSimpleName(); }
public FeatureCodec getCodec() {
return codec;
}
@@ -70,13 +74,18 @@ public class FeatureManager {
@Override
public String toString() {
- return String.format("FeatureDescriptor name=%s codec=%s feature=%s", getName(), getCodecClass().getName(), getFeatureClass().getName());
+ return String.format("FeatureDescriptor name=%s codec=%s feature=%s",
+ getName(), getCodecClass().getName(), getFeatureClass().getName());
+ }
+
+ @Override
+ public int compareTo(FeatureDescriptor o) {
+ return getName().compareTo(o.getName());
}
}
private final PluginManager pluginManager;
- private final Collection<FeatureDescriptor> featureDescriptors = new HashSet<FeatureDescriptor>();
-
+ private final Collection<FeatureDescriptor> featureDescriptors = new TreeSet<FeatureDescriptor>();
/**
* Construct a FeatureManager
@@ -114,7 +123,7 @@ public class FeatureManager {
*/
@Requires("featureClass != null")
public Collection<FeatureDescriptor> getByFeature(Class<? extends Feature> featureClass) {
- Set<FeatureDescriptor> consistentDescriptors = new HashSet<FeatureDescriptor>();
+ Set<FeatureDescriptor> consistentDescriptors = new TreeSet<FeatureDescriptor>();
if (featureClass == null)
throw new IllegalArgumentException("trackRecordType value is null, please pass in an actual class object");
@@ -189,10 +198,40 @@ public class FeatureManager {
*/
@Ensures("result != null")
public String userFriendlyListOfAvailableFeatures() {
- List names = new ArrayList();
- for ( final FeatureDescriptor descriptor : featureDescriptors )
- names.add(descriptor.getName());
- return Utils.join(",", names);
+ return userFriendlyListOfAvailableFeatures(Feature.class);
+ }
+
+ /**
+ * Returns a list of the available tribble track names (vcf,dbsnp,etc) that we can load
+ * restricted to only Codecs producting Features consistent with the requiredFeatureType
+ * @return
+ */
+ @Ensures("result != null")
+ public String userFriendlyListOfAvailableFeatures(Class<? extends Feature> requiredFeatureType) {
+ final String nameHeader="Name", featureHeader = "FeatureType", docHeader="Documentation";
+
+ int maxNameLen = nameHeader.length(), maxFeatureNameLen = featureHeader.length();
+ for ( final FeatureDescriptor descriptor : featureDescriptors ) {
+ if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) {
+ maxNameLen = Math.max(maxNameLen, descriptor.getName().length());
+ maxFeatureNameLen = Math.max(maxFeatureNameLen, descriptor.getSimpleFeatureName().length());
+ }
+ }
+
+ StringBuilder docs = new StringBuilder();
+ String format = "%" + maxNameLen + "s %" + maxFeatureNameLen + "s %s%n";
+ docs.append(String.format(format, nameHeader, featureHeader, docHeader));
+ for ( final FeatureDescriptor descriptor : featureDescriptors ) {
+ if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) {
+ String oneDoc = String.format(format,
+ descriptor.getName(),
+ descriptor.getSimpleFeatureName(),
+ GATKDocUtils.helpLinksToGATKDocs(descriptor.getCodecClass()));
+ docs.append(oneDoc);
+ }
+ }
+
+ return docs.toString();
}
/**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java
index 76b0276cd..bb65d9b09 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java
@@ -30,7 +30,9 @@ import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
+import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
@@ -42,7 +44,6 @@ import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation;
import org.broadinstitute.sting.utils.clipreads.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.ReadUtils;
-import org.yaml.snakeyaml.events.SequenceStartEvent;
import java.io.File;
import java.io.PrintStream;
@@ -51,44 +52,158 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
- * This ReadWalker provides simple, yet powerful read clipping capabilities. It allows the user to clip bases in reads
- * with poor quality scores, that match particular sequences, or that were generated by particular machine cycles.
+ * This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences.
+ *
+ *
+ *
+ * It allows the user to clip bases in reads with poor quality scores, that match particular
+ * sequences, or that were generated by particular machine cycles.
+ *
+ *
+ * - Quality score based clipping
+ * -
+ * Clip bases from the read in clipper from
+ *
argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)
+ * to the end of the read. This is blatantly stolen from BWA.
+ *
+ * Walk through the read from the end (in machine cycle order) to the beginning, calculating the
+ * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this
+ * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the
+ * clipping index in the read (from the end).
+ *
+ * - Cycle based clipping
+ * - Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
+ * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions).
+ * For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12.
+ *
+ * - Sequence matching
+ * - Clips bases from that exactly match one of a number of base sequences. This employs an exact match algorithm,
+ * filtering only bases whose sequence exactly matches SEQ.
+ *
+ *
+ *
+ *
+ * Input
+ *
+ * Any number of BAM files.
+ *
+ *
+ * Output
+ *
+ * A new BAM file containing all of the reads from the input BAMs with the user-specified clipping
+ * operation applied to each read.
+ *
+ *
+ *
Summary output
+ *
+ * Number of examined reads 13
+ * Number of clipped reads 13
+ * Percent of clipped reads 100.00
+ * Number of examined bases 988
+ * Number of clipped bases 126
+ * Percent of clipped bases 12.75
+ * Number of quality-score clipped bases 126
+ * Number of range clipped bases 0
+ * Number of sequence clipped bases 0
+ *
+ *
+ *
+ *
+ *
Example clipping
+ * Suppose we are given this read:
+ *
+ * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
+ * TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ * #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *
+ *
+ * If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:
+ *
+ *
+ * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
+ * NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ * #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *
+ *
+ * Whereas with -CR WRITE_Q0S:
+ *
+ * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
+ * TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ * !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *
+ *
+ * Or -CR SOFTCLIP_BASES:
+ *
+ * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3133 29 17S59M * * *
+ * TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ * #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *
+ *
+ *
+ * Examples
+ *
+ * -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
+ * -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
+ *
+ *
+ * @author Mark DePristo
+ * @since 2010
*/
@Requires({DataSource.READS})
public class ClipReadsWalker extends ReadWalker {
- @Output
- PrintStream out;
+ /**
+ * If provided, ClipReads will write summary statistics about the clipping operations applied
+ * to the reads to this file.
+ */
+ @Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false)
+ PrintStream out = null;
/**
- * an optional argument to dump the reads out to a BAM file
+ * The output SAM/BAM file will be written here
*/
- @Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false)
- StingSAMFileWriter outputBam = null;
+ @Output(doc = "Write BAM output here", required = true)
+ StingSAMFileWriter outputBam;
- @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "", required = false)
+ /**
+ * If a value > 0 is provided, then the quality score based read clipper will be applied to the reads using this
+ * quality score threshold.
+ */
+ @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "If provided, the Q-score clipper will be applied", required = false)
int qTrimmingThreshold = -1;
- @Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false)
+ /**
+ * Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
+ * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based
+ * values (positions). For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11,
+ * and 12.
+ */
+ @Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String indicating machine cycles to clip from the reads", required = false)
String cyclesToClipArg = null;
- @Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching these sequences", required = false)
+ /**
+ * Reads the sequences in the provided FASTA file, and clip any bases that exactly match any of the
+ * sequences in the file.
+ */
+ @Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching the sequences in this FASTA file", required = false)
String clipSequenceFile = null;
+ /**
+ * Clips bases from the reads matching the provided SEQ. Can be provided any number of times on the command line
+ */
@Argument(fullName = "clipSequence", shortName = "X", doc = "Remove sequences within reads matching this sequence", required = false)
String[] clipSequencesArgs = null;
- @Argument(fullName="read", doc="", required=false)
- String onlyDoRead = null;
-
- //@Argument(fullName = "keepCompletelyClipped", shortName = "KCC", doc = "Unfortunately, sometimes a read is completely clipped away but with SOFTCLIP_BASES this results in an invalid CIGAR string. ", required = false)
- //boolean keepCompletelyClippedReads = false;
-
-// @Argument(fullName = "onlyClipFirstSeqMatch", shortName = "ESC", doc="Only clip the first occurrence of a clipping sequence, rather than all subsequences within a read that match", required = false)
-// boolean onlyClipFirstSeqMatch = false;
-
+ /**
+ * The different values for this argument determines how ClipReads applies clips to the reads. This can range
+ * from writing Ns over the clipped bases to hard clipping away the bases from the BAM.
+ */
@Argument(fullName = "clipRepresentation", shortName = "CR", doc = "How should we actually clip the bases?", required = false)
ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS;
+ @Hidden
+ @Advanced
+ @Argument(fullName="read", doc="", required=false)
+ String onlyDoRead = null;
/**
* List of sequence that should be clipped from the reads
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java
index 4bfedb672..e2db1dc52 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/DuplicateWalker.java
@@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.walkers;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter;
+import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.utils.GenomeLoc;
@@ -17,7 +17,7 @@ import java.util.Set;
* To change this template use File | Settings | File Templates.
*/
@Requires({DataSource.READS,DataSource.REFERENCE})
-@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentReadFilter.class})
+@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class})
public abstract class DuplicateWalker extends Walker {
// Do we actually want to operate on the context?
public boolean filter(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java
index b0b2687f4..8152f74c2 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/LocusWalker.java
@@ -3,8 +3,8 @@ package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
-import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckReadFilter;
-import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter;
+import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
+import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -18,7 +18,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@By(DataSource.READS)
@Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.INTERVAL)
-@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentReadFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckReadFilter.class})
+@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class})
public abstract class LocusWalker extends Walker {
// Do we actually want to operate on the context?
public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java
index 6243a6cc0..4d8be4800 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PileupWalker.java
@@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broad.tribble.Feature;
-import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
@@ -34,9 +33,6 @@ import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
-import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
-import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
index 7e1dcd707..fdfac6bf7 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReadsWalker.java
@@ -40,26 +40,65 @@ import java.util.TreeSet;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+
/**
- * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear
- * in the input file. It can dynamically merge the contents of multiple input BAM files, resulting
- * in merged output sorted in coordinate order. Can also optionally filter reads based on the --read-filter
- * command line argument.
+ * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
+ *
+ *
+ * PrintReads can dynamically merge the contents of multiple input BAM files, resulting
+ * in merged output sorted in coordinate order. Can also optionally filter reads based on the
+ * --read_filter command line argument.
+ *
+ *
Input
+ *
+ * One or more bam files.
+ *
+ *
+ * Output
+ *
+ * A single processed bam file.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T PrintReads \
+ * -o output.bam \
+ * -I input1.bam \
+ * -I input2.bam \
+ * --read_filter MappingQualityZero
+ *
+ *
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@Requires({DataSource.READS, DataSource.REFERENCE})
public class PrintReadsWalker extends ReadWalker {
- /** an optional argument to dump the reads out to a BAM file */
+
@Output(doc="Write output to this BAM filename instead of STDOUT")
SAMFileWriter out;
+
@Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false)
String readGroup = null;
+
+ /**
+ * For example, --platform ILLUMINA or --platform 454.
+ */
@Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false)
- String platform = null; // E.g. ILLUMINA, 454
+ String platform = null;
+
@Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
int nReadsToPrint = -1;
+
+ /**
+ * Only reads from samples listed in the provided file(s) will be included in the output.
+ */
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
public Set sampleFile = new TreeSet();
+
+ /**
+ * Only reads from the sample(s) will be included in the output.
+ */
@Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
public Set<String> sampleNames = new TreeSet<String>();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java
index 6a2ffe189..cf68a9121 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java
@@ -90,7 +90,7 @@ public class AlleleBalance extends InfoFieldAnnotation {
}
// todo -- actually care about indel length from the pileup (agnostic at the moment)
int refCount = indelPileup.size();
- int altCount = vc.isInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions();
+ int altCount = vc.isSimpleInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions();
if ( refCount + altCount == 0 ) {
continue;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java
index 4102d811c..463f7a645 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java
@@ -79,7 +79,7 @@ public class HomopolymerRun extends InfoFieldAnnotation implements StandardAnnot
GenomeLoc locus = ref.getLocus();
GenomeLoc window = ref.getWindow();
int refBasePos = (int) (locus.getStart() - window.getStart())+1;
- if ( vc.isDeletion() ) {
+ if ( vc.isSimpleDeletion() ) {
// check that deleted bases are the same
byte dBase = bases[refBasePos];
for ( int i = 0; i < vc.getReference().length(); i ++ ) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java
index ff7f9a8f6..bfede40d2 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java
@@ -36,9 +36,9 @@ public class IndelType extends InfoFieldAnnotation implements ExperimentalAnnota
if (!vc.isBiallelic())
type = "MULTIALLELIC_INDEL";
else {
- if (vc.isInsertion())
+ if (vc.isSimpleInsertion())
type = "INS.";
- else if (vc.isDeletion())
+ else if (vc.isSimpleDeletion())
type = "DEL.";
else
type = "OTHER.";
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
index fc5014885..350c683c2 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java
@@ -161,19 +161,19 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(
- new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID"),
- new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name"),
- new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID"),
- new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID"),
- new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank"),
- new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If present, gene is non-coding"),
- new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "One of the most high-impact effects across all transcripts at this site"),
- new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the effect " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
- new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the effect"),
- new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid"),
- new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon"),
- new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number"),
- new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size")
+ new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"),
+ new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
+ new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
+ new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"),
+ new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant")
);
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
index 8c8bd19d0..96a400c68 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java
@@ -49,7 +49,34 @@ import java.util.*;
/**
- * Annotates variant calls with context information. Users can specify which of the available annotations to use.
+ * Annotates variant calls with context information.
+ *
+ *
+ * VariantAnnotator is a GATK tool for annotating variant calls based on their context.
+ * The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself.
+ *
+ *
Input
+ *
+ * A variant set to annotate and optionally one or more BAM files.
+ *
+ *
+ * Output
+ *
+ * An annotated VCF.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T VariantAnnotator \
+ * -I input.bam \
+ * -o output.vcf \
+ * -A DepthOfCoverage
+ * --variant input.vcf \
+ * --dbsnp dbsnp.vcf
+ *
+ *
*/
@Requires(value={})
@Allows(value={DataSource.READS, DataSource.REFERENCE})
@@ -69,8 +96,6 @@ public class VariantAnnotator extends RodWalker implements Ann
public RodBinding getSnpEffRodBinding() { return snpEffFile; }
/**
- * A dbSNP VCF file from which to annotate.
- *
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
*/
@ArgumentCollection
@@ -101,15 +126,25 @@ public class VariantAnnotator extends RodWalker implements Ann
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
- @Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false)
- protected String sampleName = null;
-
+ /**
+ * See the -list argument to view available annotations.
+ */
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
protected List annotationsToUse = new ArrayList();
+ /**
+ * See the -list argument to view available groups.
+ */
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected List annotationGroupsToUse = new ArrayList();
+ /**
+ * This option enables you to add annotations from one VCF to another.
+ *
+ * For example, if you want to annotate your 'variant' VCF with the AC field value from the rod bound to 'resource',
+ * you can specify '-E resource.AC' and records in the output VCF will be annotated with 'resource.AC=N' when a record exists in that rod at the given position.
+ * If multiple records in the rod overlap the given position, one is chosen arbitrarily.
+ */
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false)
protected List expressionsToUse = new ArrayList();
@@ -127,8 +162,6 @@ public class VariantAnnotator extends RodWalker implements Ann
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
protected boolean indelsOnly = false;
- private HashMap nonVCFsampleName = new HashMap();
-
private VariantAnnotatorEngine engine;
private Collection indelBufferContext;
@@ -164,12 +197,6 @@ public class VariantAnnotator extends RodWalker implements Ann
List rodName = Arrays.asList(variantCollection.variants.getName());
Set samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
- // add the non-VCF sample from the command-line, if applicable
- if ( sampleName != null ) {
- nonVCFsampleName.put(sampleName.toUpperCase(), "variant");
- samples.add(sampleName.toUpperCase());
- }
-
// if there are no valid samples, warn the user
if ( samples.size() == 0 ) {
logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
index a7837813a..01926a7f3 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java
@@ -29,15 +29,11 @@ import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@@ -158,7 +154,7 @@ public class VariantAnnotatorEngine {
private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) {
for ( Map.Entry, String> dbSet : dbAnnotations.entrySet() ) {
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
- String rsID = DbSNPHelper.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
+ String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null);
// annotate dbsnp id if available and not already there
if ( rsID != null && (!vc.hasID() || vc.getID().equals(VCFConstants.EMPTY_ID_FIELD)) )
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
index 40e6748ed..60f0fcb0a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java
@@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature;
+import org.broadinstitute.sting.utils.codecs.beagle.BeagleFeature;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
@@ -48,6 +48,31 @@ import static java.lang.Math.log10;
/**
* Takes files produced by Beagle imputation engine and creates a vcf with modified annotations.
+ *
+ * This walker is intended to be run after Beagle has successfully executed. The full calling sequence for using Beagle along with the GATK is:
+ *
+ * 1. Run ProduceBeagleInputWalker.
+ * 2. Run Beagle
+ * 3. Uncompress output files
+ * 4. Run BeagleOutputToVCFWalker.
+ *
+ *
+ * Note that this walker requires all input files produced by Beagle.
+ *
+ *
+ * Example
+ *
+ * java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
+ * -R reffile.fasta -T BeagleOutputToVCF \
+ * -V input_vcf.vcf \
+ * -beagleR2:BEAGLE /myrun.beagle_output.r2 \
+ * -beaglePhased:BEAGLE /myrun.beagle_output.phased \
+ * -beagleProbs:BEAGLE /myrun.beagle_output.gprobs \
+ * -o output_vcf.vcf
+ *
+
+ Note that Beagle produces some of these files compressed as .gz, so gunzip must be run on them before the walker is run in order to decompress them
+
*/
public class BeagleOutputToVCFWalker extends RodWalker {
@@ -57,22 +82,18 @@ public class BeagleOutputToVCFWalker extends RodWalker {
@Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false)
public RodBinding comp;
- @Input(fullName="beagleR2", shortName = "beagleR2", doc="VCF file", required=true)
+ @Input(fullName="beagleR2", shortName = "beagleR2", doc="Beagle-produced .r2 file containing R^2 values for all markers", required=true)
public RodBinding beagleR2;
- @Input(fullName="beagleProbs", shortName = "beagleProbs", doc="VCF file", required=true)
+ @Input(fullName="beagleProbs", shortName = "beagleProbs", doc="Beagle-produced .probs file containing posterior genotype probabilities", required=true)
public RodBinding beagleProbs;
- @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="VCF file", required=true)
+ @Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true)
public RodBinding beaglePhased;
- @Output(doc="File to which variants should be written",required=true)
+ @Output(doc="VCF File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
- @Argument(fullName="output_file", shortName="output", doc="Please use --out instead" ,required=false)
- @Deprecated
- protected String oldOutputArg;
-
@Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. Useful for imputing a sample's genotypes from a reference panel" ,required=false)
public boolean DONT_FILTER_MONOMORPHIC_SITES = false;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
index c1508cf83..07793fd7b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java
@@ -48,19 +48,45 @@ import java.io.PrintStream;
import java.util.*;
/**
- * Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input variant file
+ * Converts the input VCF into a format accepted by the Beagle imputation/analysis program.
+ *
+ *
+ *
Input
+ *
+ * A VCF with variants to convert to Beagle format
+ *
+ *
+ * Outputs
+ *
+ * A single text file which can be fed to Beagle
+ *
+ *
+ * Optional: A file with a list of markers
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
+ * -R reffile.fasta -T ProduceBeagleInput \
+ * -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
+ *
+ *
*/
+
public class ProduceBeagleInputWalker extends RodWalker {
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
- @Input(fullName="validation", shortName = "validation", doc="Input VCF file", required=false)
+ @Hidden
+ @Input(fullName="validation", shortName = "validation", doc="Validation VCF file", required=false)
public RodBinding validation;
+
@Output(doc="File to which BEAGLE input should be written",required=true)
protected PrintStream beagleWriter = null;
- @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false)
+ @Hidden
+ @Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false)
protected PrintStream markers = null;
int markerCounter = 1;
@@ -73,14 +99,19 @@ public class ProduceBeagleInputWalker extends RodWalker {
@Argument(doc="VQSqual key", shortName = "vqskey", required=false)
protected String VQSLOD_KEY = "VQSqual";
- @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
+ @Hidden
+ @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
public double insertedNoCallRate = 0;
- @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
+ @Hidden
+ @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
public double validationPrior = -1.0;
- @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
+ @Hidden
+ @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
public double bootstrap = 0.0;
- @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
+ @Hidden
+ @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
VCFWriter bootstrapVCFOutput = null;
+
@Argument(fullName = "checkIsMaleOnChrX", shortName = "checkIsMaleOnChrX", doc = "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.", required = false)
public boolean CHECK_IS_MALE_ON_CHR_X = false;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
index 90e6fcd77..32875a098 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
@@ -22,6 +22,7 @@
package org.broadinstitute.sting.gatk.walkers.coverage;
+import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@@ -42,50 +43,195 @@ import java.io.PrintStream;
/**
* Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome
*
- * @Author depristo
- * @Date May 7, 2010
+ *
+ * A very common question about a NGS set of reads is what areas of the genome are considered callable. The system
+ * considers the coverage at each locus and emits either a per base state or a summary interval BED file that
+ * partitions the genomic intervals into the following callable states:
+ *
+ * - REF_N
+ * - the reference base was an N, which is not considered callable by the GATK
+ * - CALLABLE
+ * - the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE
+ * - NO_COVERAGE
+ * - absolutely no reads were seen at this locus, regardless of the filtering parameters
+ * - LOW_COVERAGE
+ * - there were less than min. depth bases at the locus, after applying filters
+ * - EXCESSIVE_COVERAGE
+ * - more than -maxDepth reads at the locus, indicating some sort of mapping problem
+ * - POOR_MAPPING_QUALITY
+ * - more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads
+ *
+ *
+ *
+ * Input
+ *
+ * A BAM file containing exactly one sample.
+ *
+ *
+ * Output
+ *
+ *
+ * - -o: a OutputFormatted (recommended BED) file with the callable status covering each base
+ * - -summary: a table of callable status x count of all examined bases
+ *
+ *
+ *
+ * Examples
+ *
+ * -T CallableLociWalker \
+ * -I my.bam \
+ * -summary my.summary \
+ * -o my.bed
+ *
+ *
+ * would produce a BED file (my.bed) that looks like:
+ *
+ *
+ * 20 10000000 10000864 CALLABLE
+ * 20 10000865 10000985 POOR_MAPPING_QUALITY
+ * 20 10000986 10001138 CALLABLE
+ * 20 10001139 10001254 POOR_MAPPING_QUALITY
+ * 20 10001255 10012255 CALLABLE
+ * 20 10012256 10012259 POOR_MAPPING_QUALITY
+ * 20 10012260 10012263 CALLABLE
+ * 20 10012264 10012328 POOR_MAPPING_QUALITY
+ * 20 10012329 10012550 CALLABLE
+ * 20 10012551 10012551 LOW_COVERAGE
+ * 20 10012552 10012554 CALLABLE
+ * 20 10012555 10012557 LOW_COVERAGE
+ * 20 10012558 10012558 CALLABLE
+ * et cetera...
+ *
+ * as well as a summary table that looks like:
+ *
+ *
+ * state nBases
+ * REF_N 0
+ * CALLABLE 996046
+ * NO_COVERAGE 121
+ * LOW_COVERAGE 928
+ * EXCESSIVE_COVERAGE 0
+ * POOR_MAPPING_QUALITY 2906
+ *
+ *
+ * @author Mark DePristo
+ * @since May 7, 2010
*/
@By(DataSource.REFERENCE)
public class CallableLociWalker extends LocusWalker {
@Output
PrintStream out;
- @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read. The gap between this value and mmq are reads that are not sufficiently well mapped for calling but aren't indicative of mapping problems.", required = false)
+ /**
+ * Callable loci summary counts (see outputs) will be written to this file.
+ */
+ @Output(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true)
+ File summaryFile;
+
+ /**
+ * The gap between this value and mmq are reads that are not sufficiently well mapped for calling but
+ * aren't indicative of mapping problems. For example, if maxLowMAPQ = 1 and mmq = 20, then reads with
+ * MAPQ == 0 are poorly mapped, MAPQ >= 20 are considered as contributing to calling, where
+ * reads with MAPQ >= 1 and < 20 are not bad in and of themselves but aren't sufficiently good to contribute to
+ * calling. In effect these reads are invisible, driving the base to the NO_ or LOW_COVERAGE states
+ */
+ @Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read.", required = false)
byte maxLowMAPQ = 1;
- @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false)
+ /**
+ * Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE
+ * state.
+ */
+ @Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false)
byte minMappingQuality = 10;
- @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
+ /**
+ * Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state
+ */
+ @Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false)
byte minBaseQuality = 20;
+ /**
+ * If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
+ * value and is less than maxDepth the site is considered CALLABLE.
+ */
+ @Advanced
@Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
int minDepth = 4;
+ /**
+ * If the QC+ depth exceeds this value the site is considered to have EXCESSIVE_DEPTH
+ */
@Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
int maxDepth = -1;
+ /**
+ * We don't want to consider a site as POOR_MAPPING_QUALITY just because it has two reads, and one is MAPQ. We
+ * won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads
+ * covering the site.
+ */
+ @Advanced
@Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false)
int minDepthLowMAPQ = 10;
- @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
+ /**
+ * If the number of reads at this site is greater than minDepthForLowMAPQ and the fraction of reads with low mapping quality
+ * exceeds this fraction then the site has POOR_MAPPING_QUALITY.
+ */
+ @Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "If the fraction of reads at a base with low mapping quality exceeds this value, the site may be poorly mapped", required = false)
double maxLowMAPQFraction = 0.1;
- @Argument(fullName = "format", shortName = "format", doc = "Output format for the system: either BED or STATE_PER_BASE", required = false)
+ /**
+ * The output of this walker will be written in this format. The recommended option is BED.
+ */
+ @Advanced
+ @Argument(fullName = "format", shortName = "format", doc = "Output format", required = false)
OutputFormat outputFormat;
- @Argument(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true)
- File summaryFile;
+ public enum OutputFormat {
+ /**
+ * The output will be written as a BED file. There's a BED element for each
+ * continuous run of callable states (i.e., CALLABLE, REF_N, etc). This is the recommended
+ * format
+ */
+ BED,
- public enum OutputFormat { BED, STATE_PER_BASE }
+ /**
+ * Emit chr start stop state quads for each base. Produces a potentially disastrously
+ * large amount of output.
+ */
+ STATE_PER_BASE
+ }
+
+ public enum CalledState {
+ /** the reference base was an N, which is not considered callable by the GATK */
+ REF_N,
+ /** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */
+ CALLABLE,
+ /** absolutely no reads were seen at this locus, regardless of the filtering parameters */
+ NO_COVERAGE,
+ /** there were less than min. depth bases at the locus, after applying filters */
+ LOW_COVERAGE,
+ /** more than -maxDepth reads at the locus, indicating some sort of mapping problem */
+ EXCESSIVE_COVERAGE,
+ /** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */
+ POOR_MAPPING_QUALITY
+ }
////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS
////////////////////////////////////////////////////////////////////////////////////
+ @Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
+ @Override
public void initialize() {
+ if ( getToolkit().getSamples().size() != 2 ) {
+ // unbelievably there are actually two samples even when there's just one in the header. God I hate this Samples system
+ throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getToolkit().getSamples());
+ }
+
try {
PrintStream summaryOut = new PrintStream(summaryFile);
summaryOut.close();
@@ -94,15 +240,15 @@ public class CallableLociWalker extends LocusWalker
+ * DepthOfCoverage processes a set of bam files to determine coverage at different levels of partitioning and
+ * aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by
+ * sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles,
+ * and/or percentage of bases covered to or beyond a threshold.
+ * Additionally, reads and bases can be filtered by mapping or base quality score.
+ *
+ * Input
+ *
+ * One or more bam files (with proper headers) to be analyzed for coverage statistics
+ * (Optional) A REFSEQ Rod to aggregate coverage to the gene level
+ *
+ *
+ * Output
+ *
+ * Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
+ * - no suffix: per locus coverage
+ * - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
+ * - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
+ * - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
+ * - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
+ * - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
+ * - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
+ * - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
+ * - _cumulative_coverage_proportions: proportions of loci with >= X coverage, aggregated over all bases
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T DepthOfCoverage \
+ * -o file_name_base \
+ * -I input_bams.list
+ * [-geneList refSeq.sorted.txt] \
+ * [-pt readgroup] \
+ * [-ct 4 -ct 6 -ct 10] \
+ * [-L my_capture_genes.interval_list]
+ *
*
- * @Author chartl
- * @Date Feb 22, 2010
*/
// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time
// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java
index a4944e939..5c2a967b9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java
@@ -38,12 +38,32 @@ import java.util.List;
/**
* Walks along reference and calculates the GC content for each interval.
+ *
+ *
+ * Input
+ *
+ * One or more BAM files.
+ *
+ *
+ * Output
+ *
+ * GC content calculations per interval.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T GCContentByInterval \
+ * -o output.txt \
+ * -I input.bam \
+ * -L input.intervals
+ *
+ *
*/
@Allows(value = {DataSource.REFERENCE})
@Requires(value = {DataSource.REFERENCE})
-
@By(DataSource.REFERENCE)
-
public class GCContentByIntervalWalker extends LocusWalker {
@Output
protected PrintStream out;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
index 60f9724e8..fd912334f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java
@@ -35,22 +35,53 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import java.util.Collections;
import java.util.List;
/**
- * Generates an alternative reference sequence over the specified interval. Given variant ROD tracks,
- * it replaces the reference bases at variation sites with the bases supplied by the ROD(s). Additionally,
- * allows for a "snpmask" ROD to set overlapping bases to 'N'.
+ * Generates an alternative reference sequence over the specified interval.
+ *
+ *
+ * Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
+ * Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
+ *
+ *
Input
+ *
+ * The reference, requested intervals, and any number of variant rod files.
+ *
+ *
+ * Output
+ *
+ * A fasta file representing the requested intervals.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T FastaAlternateReferenceMaker \
+ * -o output.fasta \
+ * -L input.intervals \
+ * --variant input.vcf \
+ * [--snpmask mask.vcf]
+ *
+ *
*/
@WalkerName("FastaAlternateReferenceMaker")
@Reference(window=@Window(start=-1,stop=50))
@Requires(value={DataSource.REFERENCE})
public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
+ /**
+ * Variants from these input files are used by this tool to construct an alternate reference.
+ */
@Input(fullName = "variant", shortName = "V", doc="variants to model", required=false)
- public List> variants;
+ public List> variants = Collections.emptyList();
+ /**
+ * Snps from this file are used as a mask when constructing the alternate reference.
+ */
@Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false)
public RodBinding snpmask;
@@ -66,17 +97,18 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
String refBase = String.valueOf((char)ref.getBase());
// Check to see if we have a called snp
- for ( VariantContext vc : tracker.getValues(VariantContext.class) ) {
- if ( ! vc.getSource().equals(snpmask.getName())) {
- if ( vc.isDeletion()) {
- deletionBasesRemaining = vc.getReference().length();
- // delete the next n bases, not this one
- return new Pair(context.getLocation(), refBase);
- } else if ( vc.isInsertion()) {
- return new Pair(context.getLocation(), refBase.concat(vc.getAlternateAllele(0).toString()));
- } else if (vc.isSNP()) {
- return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString());
- }
+ for ( VariantContext vc : tracker.getValues(variants) ) {
+ if ( vc.isFiltered() )
+ continue;
+
+ if ( vc.isSimpleDeletion()) {
+ deletionBasesRemaining = vc.getReference().length();
+ // delete the next n bases, not this one
+ return new Pair(context.getLocation(), refBase);
+ } else if ( vc.isSimpleInsertion()) {
+ return new Pair(context.getLocation(), refBase.concat(vc.getAlternateAllele(0).toString()));
+ } else if (vc.isSNP()) {
+ return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString());
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
index 2dbfc76ff..5f3b37cc8 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceWalker.java
@@ -38,14 +38,44 @@ import org.broadinstitute.sting.utils.collections.Pair;
import java.io.PrintStream;
/**
- * Renders a new reference in FASTA format consisting of only those loci provided in the input data set. Has optional
- * features to control the output format.
+ * Renders a new reference in FASTA format consisting of only those loci provided in the input data set.
+ *
+ *
+ * The output format can be partially controlled using the provided command-line arguments.
+ *
+ *
Input
+ *
+ * The reference and requested intervals.
+ *
+ *
+ * Output
+ *
+ * A fasta file representing the requested intervals.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T FastaReference \
+ * -o output.fasta \
+ * -L input.intervals
+ *
+ *
*/
@WalkerName("FastaReferenceMaker")
public class FastaReferenceWalker extends RefWalker, GenomeLoc> {
+
@Output PrintStream out;
- @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) public int fastaLineWidth=60;
- @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity) - CAUTION: adjacent intervals will automatically be merged", required=false) public boolean fastaRawSeqs=false;
+
+ @Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false)
+ public int fastaLineWidth=60;
+
+ /**
+ * Please note that when using this argument adjacent intervals will automatically be merged.
+ */
+ @Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity)", required=false)
+ public boolean fastaRawSeqs=false;
protected FastaSequence fasta;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java
index c555e88cd..bf3606b54 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java
@@ -45,6 +45,34 @@ import java.util.*;
/**
* Filters variant calls using a number of user-selectable, parameterizable criteria.
+ *
+ *
+ * VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria.
+ * Records are hard-filtered by changing the value in the FILTER field to something other than PASS.
+ *
+ *
Input
+ *
+ * A variant set to filter.
+ *
+ *
+ * Output
+ *
+ * A filtered VCF.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T VariantFiltration \
+ * -o output.vcf \
+ * --variant input.vcf \
+ * --filterExpression "AB < 0.2 || MQ0 > 50" \
+ * --filterName "Nov09filters" \
+ * --mask mask.vcf \
+ * --maskName InDel
+ *
+ *
*/
@Reference(window=@Window(start=-50,stop=50))
public class VariantFiltrationWalker extends RodWalker {
@@ -52,33 +80,65 @@ public class VariantFiltrationWalker extends RodWalker {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
+ /**
+ * Any variant which overlaps entries from the provided mask rod will be filtered.
+ */
@Input(fullName="mask", doc="Input ROD mask", required=false)
public RodBinding mask;
@Output(doc="File to which variants should be written", required=true)
protected VCFWriter writer = null;
- @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter (see wiki docs for more info)", required=false)
+ /**
+ * VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using
+ * --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2").
+ */
+ @Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false)
protected ArrayList FILTER_EXPS = new ArrayList();
- @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
+
+ /**
+ * This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
+ */
+ @Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false)
protected ArrayList FILTER_NAMES = new ArrayList();
+ /**
+ * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
+ * VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag).
+ * One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience
+ * methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1").
+ */
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)", required=false)
protected ArrayList GENOTYPE_FILTER_EXPS = new ArrayList();
+
+ /**
+ * Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
+ */
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
protected ArrayList GENOTYPE_FILTER_NAMES = new ArrayList();
- @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3]", required=false)
+ /**
+ * Works together with the --clusterWindowSize argument.
+ */
+ @Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false)
protected Integer clusterSize = 3;
- @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0]", required=false)
+
+ /**
+ * Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1.
+ */
+ @Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false)
protected Integer clusterWindow = 0;
- @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered; [default:0]", required=false)
+ @Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false)
protected Integer MASK_EXTEND = 0;
- @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask']", required=false)
+ @Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
protected String MASK_NAME = "Mask";
- @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, should missing values be considered failing the expression (by default they are considered passing)?", required=false)
+ /**
+ * By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing.
+ * Use this argument to have it evaluate as failing filters instead for these cases.
+ */
+ @Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
protected Boolean FAIL_MISSING_VALUES = false;
// JEXL expressions for the filters
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java
index 83a8ce7d7..70f3c6a1a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java
@@ -44,7 +44,9 @@ import java.util.Set;
public abstract class AlleleFrequencyCalculationModel implements Cloneable {
public enum Model {
+ /** The default model with the best performance in all cases */
EXACT,
+ /** For posterity we have kept around the older GRID_SEARCH model, but this gives inferior results and shouldn't be used. */
GRID_SEARCH
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
index 594c1dd28..60dfe4fe7 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java
@@ -53,7 +53,9 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
}
public enum GENOTYPING_MODE {
+ /** the default; the Unified Genotyper will choose the most likely alternate allele */
DISCOVERY,
+ /** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */
GENOTYPE_GIVEN_ALLELES
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
index 1a76bfd07..e7f89bf08 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java
@@ -36,31 +36,54 @@ import java.io.File;
public class UnifiedArgumentCollection {
- // control the various models to be used
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false)
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
+ /**
+ * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
+ */
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false)
public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
+ /**
+ * The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are:
+ * het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2
+ */
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
@Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false)
public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE;
+ /**
+ * Specifies how to determine the alternate allele to use for genotyping
+ */
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
+ /**
+ * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
+ * confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
+ * is the default). Note that the confidence (QUAL) values for multi-sample low-pass (e.g. 4x per sample) calling might
+ * be significantly smaller with the new EXACT model than with our older GRID_SEARCH model, as the latter tended to
+ * over-estimate the confidence; for low-pass calling we tend to use much smaller thresholds (e.g. 4).
+ */
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
+ /**
+ * The minimum phred-scaled Qscore threshold to emit low confidence calls. Genotypes with confidence >= this but less
+ * than the calling threshold are emitted but marked as filtered.
+ */
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
+ /**
+ * This argument is not enabled by default because it increases the runtime by an appreciable amount.
+ */
@Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false)
public boolean COMPUTE_SLOD = false;
@@ -80,7 +103,6 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "abort_at_too_much_coverage", doc = "Don't call a site if the downsampled coverage is greater than this value", required = false)
public int COVERAGE_AT_WHICH_TO_ABORT = -1;
-
// control the various parameters to be used
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
public int MIN_BASE_QUALTY_SCORE = 17;
@@ -91,11 +113,17 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
-
// indel-related arguments
+ /**
+ * A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
+ * Decreasing this value will increase sensitivity but at the cost of larger calling time and a larger number of false positives.
+ */
@Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false)
public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5;
+ /**
+ * This argument informs the prior probability of having an indel at a site.
+ */
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
public double INDEL_HETEROZYGOSITY = 1.0/8000;
@@ -126,22 +154,23 @@ public class UnifiedArgumentCollection {
@Hidden
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
+
@Hidden
@Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false)
public boolean dovit = false;
+
@Hidden
@Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false)
public boolean GSA_PRODUCTION_ONLY = false;
+
@Hidden
-
@Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false)
public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL;
@Hidden
- @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
+ @Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
public boolean IGNORE_SNP_ALLELES = false;
-
@Deprecated
@Argument(fullName="output_all_callable_bases", shortName="all_bases", doc="Please use --output_mode EMIT_ALL_SITES instead" ,required=false)
private Boolean ALL_BASES_DEPRECATED = false;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
index d31bb6fb9..d5dbdedd6 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java
@@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
-import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter;
+import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
@@ -45,13 +45,73 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.PrintStream;
import java.util.*;
-
/**
- * A variant caller which unifies the approaches of several disparate callers. Works for single-sample and
- * multi-sample data. The user can choose from several different incorporated calculation models.
+ * A variant caller which unifies the approaches of several disparate callers -- Works for single-sample and multi-sample data.
+ *
+ *
+ * The GATK Unified Genotyper is a multiple-sample, technology-aware SNP and indel caller. It uses a Bayesian genotype
+ * likelihood model to estimate simultaneously the most likely genotypes and allele frequency in a population of N samples,
+ * emitting an accurate posterior probability of there being a segregating variant allele at each locus as well as for the
+ * genotype of each sample. The system can either emit just the variant sites or complete genotypes (which includes
+ * homozygous reference calls) satisfying some phred-scaled confidence value. The genotyper can make accurate calls on
+ * both single sample data and multi-sample data.
+ *
+ *
Input
+ *
+ * The read data from which to make variant calls.
+ *
+ *
+ * Output
+ *
+ * A raw, unfiltered, highly specific callset in VCF format.
+ *
+ *
+ * Example generic command for multi-sample SNP calling
+ *
+ * java -jar GenomeAnalysisTK.jar \
+ * -R resources/Homo_sapiens_assembly18.fasta \
+ * -T UnifiedGenotyper \
+ * -I sample1.bam [-I sample2.bam ...] \
+ * --dbsnp dbSNP.vcf \
+ * -o snps.raw.vcf \
+ * -stand_call_conf [50.0] \
+ * -stand_emit_conf 10.0 \
+ * -dcov [50] \
+ * [-L targets.interval_list]
+ *
+ *
+ *
+ * The above command will call all of the samples in your provided BAM files [-I arguments] together and produce a VCF file
+ * with sites and genotypes for all samples. The easiest way to get the dbSNP file is from the GATK resource bundle. Several
+ * arguments have parameters that should be chosen based on the average coverage per sample in your data. See the detailed
+ * argument descriptions below.
+ *
+ *
+ * Example command for generating calls at all sites
+ *
+ * java -jar /path/to/GenomeAnalysisTK.jar \
+ * -l INFO \
+ * -R resources/Homo_sapiens_assembly18.fasta \
+ * -T UnifiedGenotyper \
+ * -I /DCC/ftp/pilot_data/data/NA12878/alignment/NA12878.SLX.maq.SRP000031.2009_08.bam \
+ * -o my.vcf \
+ * --output_mode EMIT_ALL_SITES
+ *
+ *
+ * Caveats
+ *
+ * - The system is under active and continuous development. All outputs, the underlying likelihood model, arguments, and
+ * file formats are likely to change.
+ * - The system can be very aggressive in calling variants. In the 1000 genomes project for pilot 2 (deep coverage of ~35x)
+ * we expect the raw Qscore > 50 variants to contain at least ~10% FP calls. We use extensive post-calling filters to eliminate
+ * most of these FPs. Variant Quality Score Recalibration is a tool to perform this filtering.
+ * - We only handle diploid genotypes
+ *
+ *
*/
+
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT)
-@ReadFilters( {BadMateFilter.class, MappingQualityUnavailableReadFilter.class} )
+@ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} )
@Reference(window=@Window(start=-200,stop=200))
@By(DataSource.REFERENCE)
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
@@ -61,10 +121,9 @@ public class UnifiedGenotyper extends LocusWalker getDbsnpRodBinding() { return dbsnp.dbsnp; }
@@ -72,7 +131,9 @@ public class UnifiedGenotyper extends LocusWalker> getCompRodBindings() { return Collections.emptyList(); }
public List> getResourceRodBindings() { return Collections.emptyList(); }
- // control the output
+ /**
+ * A raw, unfiltered, highly specific callset in VCF format.
+ */
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
@@ -82,9 +143,15 @@ public class UnifiedGenotyper extends LocusWalker annotationsToUse = new ArrayList();
+ /**
+ * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
+ */
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected String[] annotationClassesToUse = { "Standard" };
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
index dc728ff6b..06455df6d 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java
@@ -51,8 +51,11 @@ public class UnifiedGenotyperEngine {
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
public enum OUTPUT_MODE {
+ /** the default */
EMIT_VARIANTS_ONLY,
+ /** include confident reference sites */
EMIT_ALL_CONFIDENT_SITES,
+ /** any callable site regardless of confidence */
EMIT_ALL_SITES
}
@@ -484,6 +487,9 @@ public class UnifiedGenotyperEngine {
Map stratifiedContexts = null;
+ if ( !BaseUtils.isRegularBase( refContext.getBase() ) )
+ return null;
+
if ( model == GenotypeLikelihoodsCalculationModel.Model.INDEL ) {
if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
@@ -498,6 +504,7 @@ public class UnifiedGenotyperEngine {
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
} else {
+
// todo - tmp will get rid of extended events so this wont be needed
if (!rawContext.hasExtendedEventPileup())
return null;
@@ -515,9 +522,6 @@ public class UnifiedGenotyperEngine {
}
} else if ( model == GenotypeLikelihoodsCalculationModel.Model.SNP ) {
- if ( !BaseUtils.isRegularBase( refContext.getBase() ) )
- return null;
-
// stratify the AlignmentContext and cut by sample
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
index fa3991694..8680f3537 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java
@@ -65,10 +65,53 @@ import java.util.*;
/**
* Performs local realignment of reads based on misalignments due to the presence of indels.
- * Unlike most mappers, this walker uses the full alignment context to determine whether an
- * appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly.
+ *
+ *
+ * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
+ * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
+ * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
+ * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
+ * it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are
+ * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect to the true indel,
+ * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus
+ * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an
+ * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and
+ * specifically identify indels.
+ *
+ *
There are 2 steps to the realignment process:
+ * - Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)
+ * - Running the realigner over those intervals (IndelRealigner)
+ *
+ *
+ * An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step.
+ *
+ * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them
+ * (or with reads from similar technologies).
+ *
+ *
Input
+ *
+ * One or more aligned BAM files and optionally one or more lists of known indels.
+ *
+ *
+ * Output
+ *
+ * A realigned version of your input BAM file(s).
+ *
+ *
+ * Examples
+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ * -I input.bam \
+ * -R ref.fasta \
+ * -T IndelRealigner \
+ * -targetIntervals intervalListFromRTC.intervals \
+ * -o realignedBam.bam \
+ * [--known /path/to/indels.vcf] \
+ * [-compress 0] (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
+ *
+ *
+ * @author ebanks
*/
-//Reference(window=@Window(start=-30,stop=30))
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
public class IndelRealigner extends ReadWalker {
@@ -77,88 +120,145 @@ public class IndelRealigner extends ReadWalker {
public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner";
public enum ConsensusDeterminationModel {
+ /**
+ * Uses only indels from a provided ROD of known indels.
+ */
KNOWNS_ONLY,
+ /**
+ * Additionally uses indels already present in the original alignments of the reads.
+ */
USE_READS,
+ /**
+ * Additionally uses 'Smith-Waterman' to generate alternate consensuses.
+ */
USE_SW
}
- @Input(fullName="known", shortName = "known", doc="Input VCF file with known indels", required=false)
+ /**
+ * Any number of VCF files representing known indels to be used for constructing alternate consensuses.
+ * Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored.
+ */
+ @Input(fullName="known", shortName = "known", doc="Input VCF file(s) with known indels", required=false)
public List> known = Collections.emptyList();
+ /**
+ * The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s).
+ */
@Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
protected String intervalsFile = null;
+ /**
+ * This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number
+ * should be adjusted based on your particular data set. For low coverage and/or when looking for indels with low allele frequency,
+ * this number should be smaller.
+ */
@Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false)
protected double LOD_THRESHOLD = 5.0;
- @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
- protected double MISMATCH_THRESHOLD = 0.15;
-
+ /**
+ * The realigned bam file.
+ */
@Output(required=false, doc="Output bam")
protected StingSAMFileWriter writer = null;
protected ConstrainedMateFixingManager manager = null;
protected SAMFileWriter writerToUse = null;
- @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "How should we determine the possible alternate consenses? -- in the order of least permissive to most permissive there is KNOWNS_ONLY (use only indels from known indels provided in RODs), USE_READS (additionally use indels already present in the original alignments of the reads), and USE_SW (additionally use 'Smith-Waterman' to generate alternate consenses). The default is USE_READS", required = false)
+ /**
+ * We recommend that users run with USE_READS when trying to realign high quality longer read data mapped with a gapped aligner;
+ * Smith-Waterman is really only necessary when using an ungapped aligner (e.g. MAQ in the case of single-end read data).
+ */
+ @Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "Determines how to compute the possible alternate consenses", required = false)
public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS;
// ADVANCED OPTIONS FOLLOW
- @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+
- "Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory).", required=false)
+ /**
+ * For expert users only! This is similar to the argument in the RealignerTargetCreator walker. The point here is that the realigner
+ * will only proceed with the realignment (even above the given threshold) if it minimizes entropy among the reads (and doesn't simply
+ * push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set.
+ */
+ @Advanced
+ @Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
+ protected double MISMATCH_THRESHOLD = 0.15;
+
+ /**
+ * For expert users only! To minimize memory consumption you can lower this number (but then the tool may skip realignment on regions with too much coverage;
+ * and if the number is too low, it may generate errors during realignment). Just make sure to give Java enough memory! 4Gb should be enough with the default value.
+ */
+ @Advanced
+ @Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter", required=false)
protected int MAX_RECORDS_IN_MEMORY = 150000;
+ /**
+ * For expert users only!
+ */
+ @Advanced
@Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false)
protected int MAX_ISIZE_FOR_MOVEMENT = 3000;
+ /**
+ * For expert users only!
+ */
+ @Advanced
@Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="maximum positional move in basepairs that a read can be adjusted during realignment", required=false)
protected int MAX_POS_MOVE_ALLOWED = 200;
+ /**
+ * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number.
+ */
+ @Advanced
@Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false)
protected int MAX_CONSENSUSES = 30;
+ /**
+ * For expert users only! If you need to find the optimal solution regardless of running time, use a higher number.
+ */
+ @Advanced
@Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false)
protected int MAX_READS_FOR_CONSENSUSES = 120;
- @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment; "+
- "if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is", required=false)
+ /**
+ * For expert users only! If this value is exceeded at a given interval, realignment is not attempted and the reads are passed to the output file(s) as-is.
+ * If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number.
+ */
+ @Advanced
+ @Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false)
protected int MAX_READS = 20000;
- @Argument(fullName="noPGTag", shortName="noPG", required=false,
- doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. "+
- "This option is required in order to pass integration tests.")
- protected boolean NO_PG_TAG = false;
-
- @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false,
- doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam.")
+ @Advanced
+ @Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam")
protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false;
- @Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false,
- doc="This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, "+
- "it will throw an exception. Use this argument when your interval list is not sorted to instruct "+"" +
- "the Realigner to first sort it in memory.")
+ /**
+ * For expert users only! This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception.
+ * Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory.
+ */
+ @Advanced
+ @Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, doc="The target intervals are not sorted")
protected boolean TARGET_NOT_SORTED = false;
- //NWay output: testing, not ready for the prime time, hence hidden:
-
- @Hidden
- @Argument(fullName="nWayOut", shortName="nWayOut", required=false,
- doc="Generate one output file for each input (-I) bam file. Reads from all input files "+
- "will be realigned together, but then each read will be saved in the output file corresponding to "+
- "the input file the read came from. There are two ways to generate output bam file names: 1) if the "+
- "value of this argument is a general string (e.g. '.cleaned.bam'), then "+
- "extensions (\".bam\" or \".sam\") will be stripped from the input file names and the provided string value "+
- "will be pasted on instead; 2) if the value ends with a '.map' (e.g. input_output.map), then " +
- "the two-column tab-separated file with the specified name must exist and list unique output file name (2nd column)" +
- "for each input file name (1st column).")
+ /**
+ * Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that
+ * the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'),
+ * then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the
+ * value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output
+ * file name (2nd column) for each input file name (1st column).
+ */
+ @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file")
protected String N_WAY_OUT = null;
+
+
+ // DEBUGGING OPTIONS FOLLOW
+
@Hidden
@Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses")
protected boolean CHECKEARLY = false;
-
- // DEBUGGING OPTIONS FOLLOW
+ @Hidden
+ @Argument(fullName="noPGTag", shortName="noPG", required=false,
+ doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
+ protected boolean NO_PG_TAG = false;
@Hidden
@Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY")
@@ -786,7 +886,7 @@ public class IndelRealigner extends ReadWalker {
for ( VariantContext knownIndel : knownIndelsToTry ) {
if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() )
continue;
- byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length());
+ byte[] indelStr = knownIndel.isSimpleInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length());
int start = knownIndel.getStart() - leftmostIndex + 1;
Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel);
if ( c != null )
@@ -988,11 +1088,11 @@ public class IndelRealigner extends ReadWalker {
if ( indexOnRef > 0 )
cigar.add(new CigarElement(indexOnRef, CigarOperator.M));
- if ( indel.isDeletion() ) {
+ if ( indel.isSimpleDeletion() ) {
refIdx += indelStr.length;
cigar.add(new CigarElement(indelStr.length, CigarOperator.D));
}
- else if ( indel.isInsertion() ) {
+ else if ( indel.isSimpleInsertion() ) {
for ( byte b : indelStr )
sb.append((char)b);
cigar.add(new CigarElement(indelStr.length, CigarOperator.I));
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java
index af8051334..17d5a8e9b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java
@@ -35,16 +35,46 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
+
/**
- * Left aligns indels in reads.
+ * Left-aligns indels from reads in a bam file.
+ *
+ *
+ * LeftAlignIndels is a tool that takes a bam file and left-aligns any indels inside it. The same indel can often be
+ * placed at multiple positions and still represent the same haplotype. While a standard convention is to place an
+ * indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
+ *
+ *
Input
+ *
+ * A bam file to left-align.
+ *
+ *
+ * Output
+ *
+ * A left-aligned bam.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx3g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T LeftAlignIndels \
+ * -I input.bam \
+ * -o output.vcf
+ *
+ *
*/
public class LeftAlignIndels extends ReadWalker {
@Output(required=false, doc="Output bam")
protected StingSAMFileWriter writer = null;
- @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+
- "If too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool may run out of memory.", required=false)
+ /**
+ * If set too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool
+ * may run out of memory. We recommend that you additionally tell Java to use a temp directory with plenty of available
+ * space (by setting java.io.tempdir on the command-line).
+ */
+ @Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the output writer", required=false)
protected int MAX_RECORDS_IN_RAM = 500000;
public void initialize() {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
index fbb62f17e..bede50a0b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java
@@ -33,7 +33,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.BadCigarFilter;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
-import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter;
+import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
@@ -52,38 +52,94 @@ import java.util.Collections;
import java.util.List;
/**
- * Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string.
+ * Emits intervals for the Local Indel Realigner to target for realignment.
+ *
+ *
+ * The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
+ * is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
+ * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
+ * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
+ * it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are
+ * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect to the true indel,
+ * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus
+ * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an
+ * appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and
+ * specifically identify indels.
+ *
+ *
There are 2 steps to the realignment process:
+ * - Determining (small) suspicious intervals which are likely in need of realignment (RealignerTargetCreator)
+ * - Running the realigner over those intervals (see the IndelRealigner tool)
+ *
+ *
+ * An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step.
+ *
+ * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them
+ * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string.
+ *
+ *
Input
+ *
+ * One or more aligned BAM files and optionally one or more lists of known indels.
+ *
+ *
+ * Output
+ *
+ * A list of target intervals to pass to the Indel Realigner.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -I input.bam \
+ * -R ref.fasta \
+ * -T RealignerTargetCreator \
+ * -o forIndelRealigner.intervals \
+ * [--known /path/to/indels.vcf]
+ *
+ *
+ * @author ebanks
*/
-@ReadFilters({Platform454Filter.class, MappingQualityZeroReadFilter.class, BadCigarFilter.class})
+@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, BadCigarFilter.class})
@Reference(window=@Window(start=-1,stop=50))
@Allows(value={DataSource.READS, DataSource.REFERENCE})
@By(DataSource.REFERENCE)
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
public class RealignerTargetCreator extends RodWalker {
+ /**
+ * The target intervals for realignment.
+ */
@Output
protected PrintStream out;
+ /**
+ * Any number of VCF files representing known SNPs and/or indels. Could be e.g. dbSNP and/or official 1000 Genomes indel calls.
+ * SNPs in these files will be ignored unless the --mismatchFraction argument is used.
+ */
@Input(fullName="known", shortName = "known", doc="Input VCF file with known indels", required=false)
public List> known = Collections.emptyList();
- // mismatch/entropy/SNP arguments
+ /**
+ * Any two SNP calls and/or high entropy positions are considered clustered when they occur no more than this many basepairs apart.
+ */
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false)
protected int windowSize = 10;
- @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false)
+ /**
+ * To disable this behavior, set this value to <= 0 or > 1. This feature is really only necessary when using an ungapped aligner
+ * (e.g. MAQ in the case of single-end read data) and should be used in conjunction with '--model USE_SW' in the IndelRealigner.
+ */
+ @Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy", required=false)
protected double mismatchThreshold = 0.0;
@Argument(fullName="minReadsAtLocus", shortName="minReads", doc="minimum reads at a locus to enable using the entropy calculation", required=false)
protected int minReadsAtLocus = 4;
- // interval merging arguments
+ /**
+ * Because the realignment algorithm is N^2, allowing too large an interval might take too long to completely realign.
+ */
@Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="maximum interval size", required=false)
protected int maxIntervalSize = 500;
- @Deprecated
- @Argument(fullName="realignReadsWithBadMates", doc="This argument is no longer used.", required=false)
- protected boolean DEPRECATED_REALIGN_MATES = false;
@Override
public boolean generateExtendedEvents() { return true; }
@@ -122,7 +178,7 @@ public class RealignerTargetCreator extends RodWalker {
// @Output
// PrintStream out;
@@ -469,10 +469,20 @@ public class SomaticIndelDetectorWalker extends ReadWalker {
// let's double check now that the read fits after the shift
if ( read.getAlignmentEnd() > normal_context.getStop()) {
// ooops, looks like the read does not fit into the window even after the latter was shifted!!
- throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+
- "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+
+ // we used to die over such reads and require user to run with larger window size. Now we
+ // just print a warning and discard the read (this means that our counts can be slightly off in
+ // the presence of such reads)
+ //throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+
+ // "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+
+ // read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+
+ // "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop());
+ System.out.println("WARNING: Read "+read.getReadName()+
+ " is out of coverage window bounds. Probably window is too small and the window_size value must be increased.\n"+
+ " The read is ignored in this run (so all the counts/statistics reported will not include it).\n"+
+ " Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+
read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+
"; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop());
+ return 1;
}
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
index ac4fba4b4..17a6e20f1 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java
@@ -23,12 +23,15 @@
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
-import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.ArgumentCollection;
+import org.broadinstitute.sting.commandline.Hidden;
+import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
-import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter;
+import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.BaseUtils;
@@ -49,16 +52,46 @@ import java.util.*;
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
-
/**
* Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads).
+ *
+ *
+ * Performs physical phasing of SNP calls, based on sequencing reads.
+ *
+ *
+ * Input
+ *
+ * VCF file of SNP calls, BAM file of sequence reads.
+ *
+ *
+ * Output
+ *
+ * Phased VCF file.
+ *
+ *
+ * Examples
+ *
+ * java
+ * -jar GenomeAnalysisTK.jar
+ * -T ReadBackedPhasing
+ * -R reference.fasta
+ * -I reads.bam
+ * --variant:vcf SNPs.vcf
+ * -BTI variant
+ * -BTIMR INTERSECTION
+ * -o phased_SNPs.vcf
+ * --phaseQualityThresh 20.0
+ *
+ *
+ * @author Menachem Fromer
+ * @since July 2010
*/
@Allows(value = {DataSource.READS, DataSource.REFERENCE})
@Requires(value = {DataSource.READS, DataSource.REFERENCE})
@By(DataSource.READS)
-@ReadFilters({MappingQualityZeroReadFilter.class})
// Filter out all reads with zero mapping quality
+@ReadFilters({MappingQualityZeroFilter.class})
public class ReadBackedPhasingWalker extends RodWalker {
private static final boolean DEBUG = false;
@@ -73,13 +106,13 @@ public class ReadBackedPhasingWalker extends RodWalker P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9
@Hidden
@@ -87,10 +120,10 @@ public class ReadBackedPhasingWalker extends RodWalker {
@Output
PrintStream out;
+ @Input(fullName="check", shortName = "check", doc="Any number of RODs", required=false)
+ public List> features = Collections.emptyList();
+
@Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false)
int numOverlaps = 2;
@@ -37,7 +42,7 @@ public class CountIntervals extends RefWalker {
return null;
}
- List checkIntervals = tracker.getValues(Feature.class, "check");
+ List checkIntervals = tracker.getValues(features);
return (long) checkIntervals.size();
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java
index 0d68c8493..09113704a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLociWalker.java
@@ -11,7 +11,31 @@ import java.io.PrintStream;
/**
* Walks over the input data set, calculating the total number of covered loci for diagnostic purposes.
+ *
+ *
* Simplest example of a locus walker.
+ *
+ *
+ *
Input
+ *
+ * One or more BAM files.
+ *
+ *
+ * Output
+ *
+ * Number of loci traversed.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T CountLoci \
+ * -o output.txt \
+ * -I input.bam \
+ * [-L input.intervals]
+ *
+ *
*/
public class CountLociWalker extends LocusWalker implements TreeReducible {
@Output(doc="Write count to this file instead of STDOUT")
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java
index 26fa9a258..e770418c1 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountPairsWalker.java
@@ -39,6 +39,26 @@ import java.util.List;
* query name order. Breaks counts down by total pairs and number
* of paired reads.
*
+ *
+ * Input
+ *
+ * One or more bam files.
+ *
+ *
+ * Output
+ *
+ * Number of pairs seen.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T CountPairs \
+ * -o output.txt \
+ * -I input.bam
+ *
+ *
* @author mhanna
*/
public class CountPairsWalker extends ReadPairWalker {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java
similarity index 62%
rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java
rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java
index d1545f159..7c7d6417a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodByRefWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsByRefWalker.java
@@ -25,7 +25,10 @@
package org.broadinstitute.sting.gatk.walkers.qc;
+import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Input;
+import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -33,25 +36,55 @@ import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.collections.Pair;
+import java.util.Collections;
+import java.util.List;
+
/**
- * Prints out counts of the number of reference ordered data objects are
- * each locus for debugging RefWalkers.
+ * Prints out counts of the number of reference ordered data objects encountered.
+ *
+ *
+ * Input
+ *
+ * One or more rod files.
+ *
+ *
+ * Output
+ *
+ * Number of rods seen.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T CountRODsByRef \
+ * -o output.txt \
+ * --rod input.vcf
+ *
+ *
*/
-public class CountRodByRefWalker extends RefWalker, Long>> {
- @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
+public class CountRODsByRefWalker extends RefWalker, Long>> {
+
+ /**
+ * One or more input rod files
+ */
+ @Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false)
+ public List> rods = Collections.emptyList();
+
+ @Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false)
public boolean verbose = false;
- @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
+ @Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false)
public boolean showSkipped = false;
- CountRodWalker crw = new CountRodWalker();
+ CountRODsWalker crw = new CountRODsWalker();
public void initialize() {
crw.verbose = verbose;
crw.showSkipped = showSkipped;
}
- public CountRodWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
+ public CountRODsWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
return crw.map(tracker, ref, context);
}
@@ -59,7 +92,7 @@ public class CountRodByRefWalker extends RefWalker, Long> reduce(CountRodWalker.Datum point, Pair, Long> sum) {
+ public Pair, Long> reduce(CountRODsWalker.Datum point, Pair, Long> sum) {
return crw.reduce(point, sum);
}
}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java
similarity index 87%
rename from public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java
rename to public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java
index 8a03dea44..edbd5ff75 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRodWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODsWalker.java
@@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.qc;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
+import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
+import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@@ -41,23 +44,46 @@ import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.collections.Pair;
import java.io.PrintStream;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.LinkedList;
-import java.util.List;
+import java.util.*;
/**
- * Prints out counts of the number of reference ordered data objects are
- * each locus for debugging RodWalkers.
+ * Prints out counts of the number of reference ordered data objects encountered.
+ *
+ *
+ * Input
+ *
+ * One or more rod files.
+ *
+ *
+ * Output
+ *
+ * Number of rods seen.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T CountRODs \
+ * -o output.txt \
+ * --rod input.vcf
+ *
+ *
*/
-public class CountRodWalker extends RodWalker, Long>> implements TreeReducible, Long>> {
+public class CountRODsWalker extends RodWalker, Long>> implements TreeReducible, Long>> {
@Output
public PrintStream out;
- @Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
+ /**
+ * One or more input rod files
+ */
+ @Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false)
+ public List> rods = Collections.emptyList();
+
+ @Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false)
public boolean verbose = false;
- @Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
+ @Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false)
public boolean showSkipped = false;
@Override
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java
index 87c0409b9..9ce9c4eec 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadsWalker.java
@@ -9,8 +9,32 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
/**
* Walks over the input data set, calculating the number of reads seen for diagnostic purposes.
+ *
+ *
* Can also count the number of reads matching a given criterion using read filters (see the
* --read-filter command line argument). Simplest example of a read-backed analysis.
+ *
+ *
+ *
Input
+ *
+ * One or more BAM files.
+ *
+ *
+ * Output
+ *
+ * Number of reads seen.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T CountReads \
+ * -o output.txt \
+ * -I input.bam \
+ * [-L input.intervals]
+ *
+ *
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CountReadsWalker extends ReadWalker {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java
new file mode 100644
index 000000000..933e24784
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/DocumentationTest.java
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2011, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.qc;
+
+import org.broad.tribble.Feature;
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
+import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.RodWalker;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+
+import java.util.*;
+
+/**
+ * Summary test
+ *
+ * Body test
+ */
+public class DocumentationTest extends RodWalker {
+ // the docs for the arguments are in the collection
+ @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
+
+ /**
+ * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants.
+ * Other sets can be specified with the -knownName (--known_names) argument.
+ */
+ @ArgumentCollection
+ protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
+
+ /**
+ * detailed documentation about the argument goes here.
+ */
+ @Input(fullName="listofRodBinding", shortName = "disc", doc="Output variants that were not called in this Feature comparison track", required=false)
+ private List> listOfRodBinding = Collections.emptyList();
+
+ @Input(fullName="optionalRodBinding", shortName = "conc", doc="Output variants that were also called in this Feature comparison track", required=false)
+ private RodBinding concordanceTrack;
+
+ @Input(fullName="optionalRodBindingWithoutDefault", shortName = "optionalRodBindingWithoutDefault", doc="Output variants that were also called in this Feature comparison track", required=false)
+ private RodBinding noDefaultOptionalRodBinding;
+
+ @Input(fullName="optionalRodBindingWithoutDefaultNull", shortName = "shortTest", doc="Output variants that were also called in this Feature comparison track", required=false)
+ private RodBinding noDefaultOptionalRodBindingNull = null;
+
+ @Input(fullName="featureArg", shortName = "featureArg", doc="A RodBinding of feature", required=false)
+ private RodBinding featureArg = null;
+
+ @Output(doc="VCFWriter",required=true)
+ protected VCFWriter vcfWriter = null;
+
+ @Advanced
+ @Argument(fullName="setString", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
+ public Set sampleNames;
+
+ @Argument(fullName="setStringInitialized", shortName="setStringInitialized", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
+ public Set setStringInitialized = new HashSet();
+
+ @Argument(shortName="optionalArgWithMissinglessDefault", doc="One or more criteria to use when selecting the data. Evaluated *after* the specified samples are extracted and the INFO-field annotations are updated.", required=false)
+ public ArrayList SELECT_EXPRESSIONS = new ArrayList();
+
+ @Argument(shortName="AAAAA", fullName = "AAAAA", doc="Should be the first argument", required=false)
+ public boolean FIRST_ARG = false;
+
+ @Advanced
+ @Argument(fullName="booleanArg", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
+ private boolean EXCLUDE_NON_VARIANTS = false;
+
+ @Advanced
+ @Argument(fullName="booleanArray", shortName="booleanArray", doc="x", required=false)
+ private boolean[] boolArray = null;
+
+ @Argument(fullName="enumTest", shortName="enumTest", doc="Test enum", required=false)
+ private TestEnum TestEnumArg = TestEnum.ENUM2;
+ public enum TestEnum {
+ /** Docs for enum1 */
+ ENUM1,
+ /** Docs for enum2 */
+ ENUM2
+ }
+
+ @Hidden
+ @Argument(fullName="hiddenArg", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
+ private boolean KEEP_AF_SPECTRUM = false;
+
+ public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; }
+ public Integer reduceInit() { return 0; }
+ public Integer reduce(Integer value, Integer sum) { return value + sum; }
+ public void onTraversalDone(Integer result) { }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java
index bd25a73e0..ca30d875b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ValidatingPileupWalker.java
@@ -32,7 +32,7 @@ import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature;
+import org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java
index fc6b3daee..9b0824ed0 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesGatherer.java
@@ -1,3 +1,28 @@
+/*
+ * Copyright (c) 2011 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.commandline.Gatherer;
@@ -12,11 +37,8 @@ import java.util.List;
import java.util.regex.Pattern;
/**
- * Created by IntelliJ IDEA.
* User: carneiro
* Date: 3/29/11
- * Time: 3:54 PM
- * To change this template use File | Settings | File Templates.
*/
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
index b4739f366..98c8950e3 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CountCovariatesWalker.java
@@ -29,8 +29,8 @@ import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter;
-import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter;
+import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
+import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.BaseUtils;
@@ -50,27 +50,54 @@ import java.util.List;
import java.util.Map;
/**
- * First pass of the recalibration. Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
+ * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
*
- * This walker is designed to work as the first pass in a two-pass processing step.
- * It does a by-locus traversal operating only at sites that are not in dbSNP.
- * We assume that all reference mismatches we see are therefore errors and indicative of poor base quality.
- * This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinucleotide)
- * Since there is a large amount of data one can then calculate an empirical probability of error
- * given the particular covariates seen at this site, where p(error) = num mismatches / num observations
- * The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score)
- * The first non-comment line of the output file gives the name of the covariates that were used for this calculation.
+ *
+ * This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
+ * only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
+ * of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
+ * reported quality score, cycle, and dinucleotide). Since there is a large amount of data one can then calculate an empirical
+ * probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
+ * The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score).
+ *
+ * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified.
*
- * Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified
- * Note: This walker is designed to be used in conjunction with TableRecalibrationWalker.
+ *
+ * See the GATK wiki for a tutorial and example recalibration accuracy plots.
+ * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
+ *
+ *
Input
+ *
+ * The input read data whose base quality scores need to be assessed.
+ *
+ * A database of known polymorphic sites to skip over.
+ *
+ *
+ * Output
+ *
+ * A recalibration table file in CSV format that is used by the TableRecalibration walker.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ * -R resources/Homo_sapiens_assembly18.fasta \
+ * -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
+ * -knownSites another/optional/setOfSitesToMask.vcf \
+ * -I my_reads.bam \
+ * -T CountCovariates \
+ * -cov ReadGroupCovariate \
+ * -cov QualityScoreCovariate \
+ * -cov CycleCovariate \
+ * -cov DinucCovariate \
+ * -recalFile my_reads.recal_data.csv
+ *
*
- * @author rpoplin
- * @since Nov 3, 2009
*/
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
-@ReadFilters( {MappingQualityZeroReadFilter.class, MappingQualityUnavailableReadFilter.class} ) // Filter out all reads with zero or unavailable mapping quality
+@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
@PartitionBy(PartitionType.LOCUS)
public class CountCovariatesWalker extends LocusWalker implements TreeReducible {
@@ -96,14 +123,23 @@ public class CountCovariatesWalker extends LocusWalker> knownSites = Collections.emptyList();
- @Output
- PrintStream out;
+
+ /**
+ * After the header, data records occur one per line until the end of the file. The first several items on a line are the
+ * values of the individual covariates and will change depending on which covariates were specified at runtime. The last
+ * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
+ * and the raw empirical quality score calculated by phred-scaling the mismatch rate.
+ */
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file")
@Gather(CountCovariatesGatherer.class)
public PrintStream RECAL_FILE;
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
private boolean LIST_ONLY = false;
+
+ /**
+ * See the -list argument to view available covariates.
+ */
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false)
private String[] COVARIATES = null;
@Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false)
@@ -114,6 +150,10 @@ public class CountCovariatesWalker extends LocusWalker covClass : covariateClasses ) {
- out.println( covClass.getSimpleName() );
+ logger.info( covClass.getSimpleName() );
}
- out.println();
+ logger.info("");
System.exit( 0 ); // Early exit here because user requested it
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
index e6d0b306c..ac25d4f13 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalDataManager.java
@@ -66,15 +66,22 @@ public class RecalDataManager {
private static boolean warnUserNullPlatform = false;
public enum SOLID_RECAL_MODE {
+ /** Treat reference inserted bases as reference matching bases. Very unsafe! */
DO_NOTHING,
+ /** Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. */
SET_Q_ZERO,
+ /** In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. */
SET_Q_ZERO_BASE_N,
+ /** Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. */
REMOVE_REF_BIAS
}
public enum SOLID_NOCALL_STRATEGY {
+ /** When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. */
THROW_EXCEPTION,
+ /** Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. */
LEAVE_READ_UNRECALIBRATED,
+ /** Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. */
PURGE_READ
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java
index 0e7f7d111..f31e2fc5b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/RecalibrationArgumentCollection.java
@@ -51,12 +51,27 @@ public class RecalibrationArgumentCollection {
public String FORCE_PLATFORM = null;
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
public int WINDOW_SIZE = 5;
+
+ /**
+ * This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
+ */
@Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false)
public int HOMOPOLYMER_NBACK = 7;
@Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false)
public boolean EXCEPTION_IF_NO_TILE = false;
+
+ /**
+ * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the
+ * reads which have had the reference inserted because of color space inconsistencies.
+ */
@Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO;
+
+ /**
+ * CountCovariates and TableRecalibration accept a --solid_nocall_strategy flag which governs how the recalibrator handles
+ * no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
+ * their color space tag can not be recalibrated.
+ */
@Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false)
public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
index a044abecb..174e810c2 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/TableRecalibrationWalker.java
@@ -52,19 +52,40 @@ import java.util.ResourceBundle;
import java.util.regex.Pattern;
/**
- * Second pass of the recalibration. Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
+ * Second pass of the base quality score recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
*
- * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal.
+ *
+ * This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each
+ * base in each read this walker calculates various user-specified covariates (such as read group, reported quality score,
+ * cycle, and dinuc). Using these values as a key in a large hashmap the walker calculates an empirical base quality score
+ * and overwrites the quality score currently in the read. This walker then outputs a new bam file with these updated (recalibrated) reads.
*
- * For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
- * Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read.
- * This walker then outputs a new bam file with these updated (recalibrated) reads.
+ *
+ * See the GATK wiki for a tutorial and example recalibration accuracy plots.
+ * http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
*
- * Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker.
- * Note: This walker is designed to be used in conjunction with CovariateCounterWalker.
+ *
Input
+ *
+ * The input read data whose base quality scores need to be recalibrated.
+ *
+ * The recalibration table file in CSV format that was generated by the CountCovariates walker.
+ *
+ *
+ * Output
+ *
+ * A bam file in which the quality scores in each read have been recalibrated.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx4g -jar GenomeAnalysisTK.jar \
+ * -R resources/Homo_sapiens_assembly18.fasta \
+ * -I my_reads.bam \
+ * -T TableRecalibration \
+ * -o my_reads.recal.bam \
+ * -recalFile my_reads.recal_data.csv
+ *
*
- * @author rpoplin
- * @since Nov 3, 2009
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@@ -79,24 +100,54 @@ public class TableRecalibrationWalker extends ReadWalker flag that instructs TableRecalibration to not modify
+ * quality scores less than the specified value but rather just write them out unmodified in the recalibrated BAM file. This is useful
+ * because Solexa writes Q2 and Q3 bases when the machine has really gone wrong. This would be fine in and of itself,
+ * but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect,
+ * your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases
+ * are unmodified during recalibration, so they don't get inappropriately evaluated.
+ */
+ @Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
private int PRESERVE_QSCORES_LESS_THAN = 5;
- @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1")
+
+ /**
+ * By default TableRecalibration applies a Yates' correction to account for overfitting when it calculates the empirical
+ * quality score, in particular, ( # mismatches + 1 ) / ( # observations + 1 ). TableRecalibration accepts a --smoothing / -sm
+ * argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example,
+ * --smoothing 15 for a large amount of smoothing.
+ */
+ @Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
private int SMOOTHING = 1;
- @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default=50")
+
+ /**
+ * Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
+ * by capping at the specified value. We've found that Q40 is too low when using a more complete database of known variation like dbSNP build 132 or later.
+ */
+ @Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores")
private int MAX_QUALITY_SCORE = 50;
+
+ /**
+ * By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun
+ * the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag.
+ */
@Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read")
private boolean DO_NOT_WRITE_OQ = false;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java
new file mode 100755
index 000000000..2b38afaf6
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/GenotypeAndValidateWalker.java
@@ -0,0 +1,463 @@
+/*
+ * Copyright (c) 2010 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.walkers.validation;
+
+import org.broadinstitute.sting.commandline.*;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
+import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
+import org.broadinstitute.sting.utils.SampleUtils;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.variantcontext.MutableVariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
+
+import java.util.Map;
+import java.util.Set;
+
+import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel;
+
+/**
+ * Genotypes a dataset and validates the calls of another dataset using the Unified Genotyper.
+ *
+ *
+ * Genotype and Validate is a tool to evaluate the quality of a dataset for calling SNPs
+ * and Indels given a secondary (validation) data source. The data sources are BAM or VCF
+ * files. You can use them interchangeably (i.e. a BAM to validate calls in a VCF or a VCF
+ * to validate calls on a BAM).
+ *
+ *
+ *
+ * The simplest scenario is when you have a VCF of hand annotated SNPs and Indels, and you
+ * want to know how well a particular technology performs calling these SNPs. With a
+ * dataset (BAM file) generated by the technology in test, and the hand annotated VCF, you
+ * can run GenotypeAndValidate to assess the accuracy of the calls with the new technology's
+ * dataset.
+ *
+ *
+ *
+ * Another option is to validate the calls on a VCF file, using a deep coverage BAM file
+ * that you trust the calls on. The GenotypeAndValidate walker will make calls using the
+ * reads in the BAM file and take them as truth, then compare to the calls in the VCF file
+ * and produce a truth table.
+ *
+ *
+ *
+ * Input
+ *
+ * A BAM file to make calls on and a VCF file to use as truth validation dataset.
+ *
+ * You also have the option to invert the roles of the files using the command line options listed below.
+ *
+ *
+ * Output
+ *
+ * GenotypeAndValidate has two outputs. The truth table and the optional VCF file. The truth table is a
+ * 2x2 table correlating what was called in the dataset with the truth of the call (whether it's a true
+ * positive or a false positive). The table should look like this:
+ *
+ *
+ *
+ *
+ * |
+ * ALT |
+ * REF |
+ * Predictive Value |
+ *
+ *
+ * | called alt |
+ * True Positive (TP) |
+ * False Positive (FP) |
+ * Positive PV |
+ *
+ *
+ * | called ref |
+ * False Negative (FN) |
+ * True Negative (TN) |
+ * Negative PV |
+ *
+ *
+ *
+ *
+ *
+ * The positive predictive value (PPV) is the proportion of subjects with positive test results
+ * who are correctly diagnosed.
+ *
+ *
+ * The negative predictive value (NPV) is the proportion of subjects with a negative test result
+ * who are correctly diagnosed.
+ *
+ *
+ * The VCF file will contain only the variants that were called or not called, excluding the ones that
+ * were uncovered or didn't pass the filters. This file is useful if you are trying to compare
+ * the PPV and NPV of two different technologies on the exact same sites (so you can compare apples to
+ * apples).
+ *
+ *
+ *
+ * Here is an example of an annotated VCF file (info field clipped for clarity)
+ *
+ *
+ * #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
+ * 1 20568807 . C T 0 HapMapHet AC=1;AF=0.50;AN=2;DP=0;GV=T GT 0/1
+ * 1 22359922 . T C 282 WG-CG-HiSeq AC=2;AF=0.50;GV=T;AN=4;DP=42 GT:AD:DP:GL:GQ 1/0 ./. 0/1:20,22:39:-72.79,-11.75,-67.94:99 ./.
+ * 13 102391461 . G A 341 Indel;SnpCluster AC=1;GV=F;AF=0.50;AN=2;DP=45 GT:AD:DP:GL:GQ ./. ./. 0/1:32,13:45:-50.99,-13.56,-112.17:99 ./.
+ * 1 175516757 . C G 655 SnpCluster,WG AC=1;AF=0.50;AN=2;GV=F;DP=74 GT:AD:DP:GL:GQ ./. ./. 0/1:52,22:67:-89.02,-20.20,-191.27:99 ./.
+ *
+ *
+ *
+ *
+ * Additional Details
+ *
+ * -
+ * You should always use -BTI on your VCF track, so that the GATK only looks at the sites on the VCF file.
+ * This speeds up the process a lot.
+ *
+ * -
+ * The total number of visited bases may be greater than the number of variants in the original
+ * VCF file because of extended indels, as they trigger one call per new insertion or deletion.
+ * (i.e. ACTG/- will count as 4 genotyper calls, but it's only one line in the VCF).
+ *
+ *
+ *
+ * Examples
+ *
+ * -
+ * Genotypes BAM file from new technology using the VCF as a truth dataset:
+ *
+ *
+ *
+ * java
+ * -jar /GenomeAnalysisTK.jar
+ * -T GenotypeAndValidate
+ * -R human_g1k_v37.fasta
+ * -I myNewTechReads.bam
+ * -alleles handAnnotatedVCF.vcf
+ * -BTI alleles
+ *
+ *
+ * -
+ * Using a BAM file as the truth dataset:
+ *
+ *
+ *
+ * java
+ * -jar /GenomeAnalysisTK.jar
+ * -T GenotypeAndValidate
+ * -R human_g1k_v37.fasta
+ * -I myTruthDataset.bam
+ * -alleles callsToValidate.vcf
+ * -BTI alleles
+ * -bt
+ * -o gav.vcf
+ *
+ *
+ *
+ * @author Mauricio Carneiro
 + * @since 2011
+ */
+
+@Requires(value={DataSource.READS, DataSource.REFERENCE})
+@Allows(value={DataSource.READS, DataSource.REFERENCE})
+
+@By(DataSource.REFERENCE)
+@Reference(window=@Window(start=-200,stop=200))
+
+
+public class GenotypeAndValidateWalker extends RodWalker implements TreeReducible {
+
+ /**
 + * The optional output file that will have all the variants used in the Genotype and Validation assay.
+ */
+ @Output(doc="Generate a VCF file with the variants considered by the walker, with a new annotation \"callStatus\" which will carry the value called in the validation VCF or BAM file", required=false)
+ protected VCFWriter vcfWriter = null;
+
+ /**
+ * The callset to be used as truth (default) or validated (if BAM file is set to truth).
+ */
+ @Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype", required=true)
+ public RodBinding alleles;
+
+ /**
+ * Makes the Unified Genotyper calls to the BAM file the truth dataset and validates the alleles ROD binding callset.
+ */
+ @Argument(fullName ="set_bam_truth", shortName ="bt", doc="Use the calls on the reads (bam file) as the truth dataset and validate the calls on the VCF", required=false)
+ private boolean bamIsTruth = false;
+
+ /**
+ * The minimum base quality score necessary for a base to be considered when calling a genotype. This argument is passed to the Unified Genotyper.
+ */
+ @Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false)
+ private int mbq = -1;
+
+ /**
+ * The maximum deletion fraction allowed in a site for calling a genotype. This argument is passed to the Unified Genotyper.
+ */
+ @Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false)
+ private double deletions = -1;
+
+ /**
+ * the minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. This argument is passed to the Unified Genotyper.
+ */
+ @Argument(fullName="standard_min_confidence_threshold_for_calling", shortName="stand_call_conf", doc="the minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls", required=false)
+ private double callConf = -1;
+
+ /**
+ * the minimum phred-scaled Qscore threshold to emit low confidence calls. This argument is passed to the Unified Genotyper.
+ */
+ @Argument(fullName="standard_min_confidence_threshold_for_emitting", shortName="stand_emit_conf", doc="the minimum phred-scaled Qscore threshold to emit low confidence calls", required=false)
+ private double emitConf = -1;
+
+ /**
+ * Only validate sites that have at least a given depth
+ */
+ @Argument(fullName="condition_on_depth", shortName="depth", doc="Condition validation on a minimum depth of coverage by the reads", required=false)
+ private int minDepth = -1;
+
+ /**
+ * If your VCF or BAM file has more than one sample and you only want to validate one, use this parameter to choose it.
+ */
+ @Hidden
+ @Argument(fullName ="sample", shortName ="sn", doc="Name of the sample to validate (in case your VCF/BAM has more than one sample)", required=false)
+ private String sample = "";
+
+ private UnifiedGenotyperEngine snpEngine;
+ private UnifiedGenotyperEngine indelEngine;
+
+ public static class CountedData {
+ private long nAltCalledAlt = 0L;
+ private long nAltCalledRef = 0L;
+ private long nRefCalledAlt = 0L;
+ private long nRefCalledRef = 0L;
+ private long nNotConfidentCalls = 0L;
+ private long nUncovered = 0L;
+
+ /**
+ * Adds the values of other to this, returning this
+ * @param other the other object
+ */
+ public void add(CountedData other) {
+ nAltCalledAlt += other.nAltCalledAlt;
+ nAltCalledRef += other.nAltCalledRef;
+ nRefCalledAlt += other.nRefCalledAlt;
+ nRefCalledRef += other.nRefCalledRef;
+ nUncovered += other.nUncovered;
+ nNotConfidentCalls += other.nNotConfidentCalls;
+ }
+ }
+
+
+
+ //---------------------------------------------------------------------------------------------------------------
+ //
+ // initialize
+ //
+ //---------------------------------------------------------------------------------------------------------------
+
+ public void initialize() {
+
+ // Initialize VCF header
+ if (vcfWriter != null) {
+ Map header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName());
+ Set samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
+ Set headerLines = VCFUtils.smartMergeHeaders(header.values(), logger);
+ headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate"));
+ vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
+ }
+
+ // Filling in SNP calling arguments for UG
+ UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
+ uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
+ uac.alleles = alleles;
+ if (!bamIsTruth) uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
+ if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq;
+ if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions;
+ if (emitConf >= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf;
+ if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf;
+
+ uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
+ snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac);
+
+ // Adding the INDEL calling arguments for UG
+ uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL;
+ indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac);
+
+ // make sure we have callConf set to the threshold set by the UAC so we can use it later.
+ callConf = uac.STANDARD_CONFIDENCE_FOR_CALLING;
+ }
+
+ //---------------------------------------------------------------------------------------------------------------
+ //
+ // map
+ //
+ //---------------------------------------------------------------------------------------------------------------
+
+ public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
+
+ final CountedData counter = new CountedData();
+
+ // For some reason RodWalkers get map calls with null trackers
+ if( tracker == null )
+ return counter;
+
+ VariantContext vcComp = tracker.getFirstValue(alleles);
+ if( vcComp == null )
+ return counter;
+
+ //todo - not sure I want this, may be misleading to filter extended indel events.
+ if (isInsideExtendedIndel(vcComp, ref))
+ return counter;
+
+ // Do not operate on variants that are not covered to the optional minimum depth
+ if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) {
+ counter.nUncovered = 1L;
+ return counter;
+ }
+
+ VariantCallContext call;
+ if ( vcComp.isSNP() )
+ call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
+ else if ( vcComp.isIndel() ) {
+ call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
+ }
+ else {
+ logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles());
+ return counter;
+ }
+
+
+ boolean writeVariant = true;
+
+ if (bamIsTruth) {
+ if (call.confidentlyCalled) {
 + // If truth is a confident ALT call
+ if (call.isVariant()) {
+ if (vcComp.isVariant())
+ counter.nAltCalledAlt = 1L; // todo -- may wanna check if the alts called are the same?
+ else
+ counter.nAltCalledRef = 1L;
+ }
 + // If truth is a confident REF call
+ else {
+ if (vcComp.isVariant())
+ counter.nRefCalledAlt = 1L;
+ else
+ counter.nRefCalledRef = 1L;
+ }
+ }
+ else {
+ counter.nNotConfidentCalls = 1L;
+ writeVariant = false;
+ }
+ }
+ else {
+ if (!vcComp.hasAttribute("GV"))
+ throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart());
+
+
+
+ if (call.isCalledAlt(callConf)) {
+ if (vcComp.getAttribute("GV").equals("T"))
+ counter.nAltCalledAlt = 1L;
+ else
+ counter.nRefCalledAlt = 1L;
+ }
+ else if (call.isCalledRef(callConf)) {
+ if (vcComp.getAttribute("GV").equals("T"))
+ counter.nAltCalledRef = 1L;
+ else
+ counter.nRefCalledRef = 1L;
+ }
+ else {
+ counter.nNotConfidentCalls = 1L;
+ writeVariant = false;
+ }
+ }
+
+ if (vcfWriter != null && writeVariant) {
+ if (!vcComp.hasAttribute("callStatus")) {
+ MutableVariantContext mvc = new MutableVariantContext(vcComp);
+ mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF" );
+ vcfWriter.add(mvc);
+ }
+ else
+ vcfWriter.add(vcComp);
+ }
+ return counter;
+ }
+
+ //---------------------------------------------------------------------------------------------------------------
+ //
+ // reduce
+ //
+ //---------------------------------------------------------------------------------------------------------------
+
+ public CountedData reduceInit() {
+ return new CountedData();
+ }
+
+ public CountedData treeReduce( final CountedData sum1, final CountedData sum2) {
+ sum2.add(sum1);
+ return sum2;
+ }
+
+ public CountedData reduce( final CountedData mapValue, final CountedData reduceSum ) {
+ reduceSum.add(mapValue);
+ return reduceSum;
+ }
+
+ public void onTraversalDone( CountedData reduceSum ) {
+ double ppv = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nRefCalledAlt));
+ double npv = 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nAltCalledRef));
+ double sensitivity = 100 * ((double) reduceSum.nAltCalledAlt /( reduceSum.nAltCalledAlt + reduceSum.nAltCalledRef));
+ double specificity = (reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt > 0) ? 100 * ((double) reduceSum.nRefCalledRef /( reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt)) : 100;
+ logger.info(String.format("Resulting Truth Table Output\n\n" +
+ "---------------------------------------------------\n" +
+ "\t\t|\tALT\t|\tREF\t\n" +
+ "---------------------------------------------------\n" +
+ "called alt\t|\t%d\t|\t%d\n" +
+ "called ref\t|\t%d\t|\t%d\n" +
+ "---------------------------------------------------\n" +
+ "positive predictive value: %f%%\n" +
+ "negative predictive value: %f%%\n" +
+ "---------------------------------------------------\n" +
+ "sensitivity: %f%%\n" +
+ "specificity: %f%%\n" +
+ "---------------------------------------------------\n" +
+ "not confident: %d\n" +
+ "not covered: %d\n" +
+ "---------------------------------------------------\n", reduceSum.nAltCalledAlt, reduceSum.nRefCalledAlt, reduceSum.nAltCalledRef, reduceSum.nRefCalledRef, ppv, npv, sensitivity, specificity, reduceSum.nNotConfidentCalls, reduceSum.nUncovered));
+ }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
index 7653f511f..f9bd019ea 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java
@@ -14,9 +14,8 @@ import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
+import org.broadinstitute.sting.utils.codecs.table.TableFeature;
import org.broadinstitute.sting.gatk.walkers.DataSource;
-import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.BaseUtils;
@@ -31,21 +30,77 @@ import java.util.LinkedList;
import java.util.List;
/**
- * Created by IntelliJ IDEA.
- * User: chartl
- * Date: 6/13/11
- * Time: 2:12 PM
- * To change this template use File | Settings | File Templates.
 + * Creates FASTA sequences for use in Sequenom or PCR utilities for site amplification and subsequent validation
+ *
+ *
+ * ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe
+ * sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within
+ * the output amplicon, lower-casing non-specific tracts, allows for users to provide sites to mask out, and specifies
+ * reasons why the site may fail validation (nearby variation, for example).
+ *
+ *
+ * Input
+ *
+ * Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an
+ * interval list defining the size of the amplicons around the sites to be validated
+ *
+ *
+ * Output
+ *
+ * Output is a FASTA-formatted file with some modifications at probe sites. For instance:
+ *
+ * >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414
+ * CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC
+ * >20:792122 Valid 20_792122
+ * TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT
+ * >20:994145 Valid 20_994145
+ * TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC
+ * >20:1074230 SITE_IS_FILTERED=1, 20_1074230
+ * ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC
+ * >20:1084330 DELETION=1, 20_1084330
+ * CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
+ *
+ * are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
+ *
+ * Valid // amplicon is valid
+ * SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
+ * VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
+ * MULTIPLE_PROBES=1, // multiple variants to be validated found inside the same amplicon
+ * DELETION=6,INSERTION=5, // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate
+ * DELETION=1, // deletion found inside the amplicon region, could shift mass-spec peak
+ * START_TOO_CLOSE, // variant is too close to the start of the amplicon region to give sequenom a good chance to find a suitable primer
+ * END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
+ * NO_VARIANTS_FOUND, // no variants found within the amplicon region
 + * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or following, or a deletion that spans the site itself)
+ *
+ *
+ * Examples
+ *
+ * java
+ * -jar GenomeAnalysisTK.jar
+ * -T ValidationAmplicons
+ * -R /humgen/1kg/reference/human_g1k_v37.fasta
+ * -BTI ProbeIntervals
+ * -ProbeIntervals:table interval_table.table
+ * -ValidateAlleles:vcf sites_to_validate.vcf
+ * -MaskAlleles:vcf mask_sites.vcf
+ * --virtualPrimerSize 30
+ * -o probes.fasta
+ *
+ *
+ * @author chartl
+ * @since July 2011
*/
@Requires(value={DataSource.REFERENCE})
public class ValidationAmplicons extends RodWalker {
- @Input(fullName = "ProbeIntervals", doc="Chris document me", required=true)
+ @Input(fullName = "ProbeIntervals", doc="A collection of intervals in table format with optional names that represent the "+
+ "intervals surrounding the probe sites amplicons should be designed for", required=true)
RodBinding probeIntervals;
- @Input(fullName = "ValidateAlleles", doc="Chris document me", required=true)
+ @Input(fullName = "ValidateAlleles", doc="A VCF containing the sites and alleles you want to validate. Restricted to *BI-Allelic* sites", required=true)
RodBinding validateAlleles;
- @Input(fullName = "MaskAlleles", doc="Chris document me", required=true)
+ @Input(fullName = "MaskAlleles", doc="A VCF containing the sites you want to MASK from the designed amplicon (e.g. by Ns or lower-cased bases)", required=true)
RodBinding maskAlleles;
@@ -195,17 +250,17 @@ public class ValidationAmplicons extends RodWalker {
} else /* (mask != null && validate == null ) */ {
if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )) {
logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed.");
- logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
+ logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isSimpleInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
sequenceInvalid = true;
- invReason.add(mask.isInsertion() ? "INSERTION" : "DELETION");
+ invReason.add(mask.isSimpleInsertion() ? "INSERTION" : "DELETION");
// note: indelCounter could be > 0 (could have small deletion within larger one). This always selects
// the larger event.
- int indelCounterNew = mask.isInsertion() ? 2 : mask.getEnd()-mask.getStart();
+ int indelCounterNew = mask.isSimpleInsertion() ? 2 : mask.getEnd()-mask.getStart();
if ( indelCounterNew > indelCounter ) {
indelCounter = indelCounterNew;
}
//sequence.append((char) ref.getBase());
- //sequence.append(mask.isInsertion() ? 'I' : 'D');
+ //sequence.append(mask.isSimpleInsertion() ? 'I' : 'D');
sequence.append("N");
indelCounter--;
rawSequence.append(Character.toUpperCase((char) ref.getBase()));
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
index 253c6e6d0..613a31ed3 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalWalker.java
@@ -36,25 +36,66 @@ import java.util.*;
/**
* General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more)
+ *
+ *
+ * Given a variant callset, it is common to calculate various quality control metrics. These metrics include the number of
+ * raw or filtered SNP counts; ratio of transition mutations to transversions; concordance of a particular sample's calls
+ * to a genotyping chip; number of singletons per sample; etc. Furthermore, it is often useful to stratify these metrics
+ * by various criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the amino acid
+ * degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in
+ * evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation
+ * and stratification modules.
+ *
+ *
Input
+ *
+ * One or more variant sets to evaluate plus any number of comparison sets.
+ *
+ *
+ * Output
+ *
+ * Evaluation tables.
+ *
+ *
+ * Examples
+ *
+ * java -Xmx2g -jar GenomeAnalysisTK.jar \
+ * -R ref.fasta \
+ * -T VariantEval \
+ * -o output.eval.gatkreport \
+ * --eval:set1 set1.vcf \
+ * --eval:set2 set2.vcf \
+ * [--comp comp.vcf]
+ *
+ *
*/
@Reference(window=@Window(start=-50, stop=50))
public class VariantEvalWalker extends RodWalker implements TreeReducible {
- // Output arguments
+
@Output
protected PrintStream out;
+ /**
+ * The variant file(s) to evaluate.
+ */
@Input(fullName="eval", shortName = "eval", doc="Input evaluation file(s)", required=true)
public List> evals;
+ /**
+ * The variant file(s) to compare against.
+ */
@Input(fullName="comp", shortName = "comp", doc="Input comparison file(s)", required=false)
public List> compsProvided = Collections.emptyList();
private List> comps = new ArrayList>();
+ /**
+ * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants.
+ * Other sets can be specified with the -knownName (--known_names) argument.
+ */
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
// Help arguments
- @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit")
+ @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false)
protected Boolean LIST = false;
// Partitioning the data arguments
@@ -67,8 +108,12 @@ public class VariantEvalWalker extends RodWalker implements Tr
@Argument(fullName="sample", shortName="sn", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false)
protected Set SAMPLE_EXPRESSIONS;
+ /**
+ * List of rod tracks to be used for specifying "known" variants other than dbSNP.
+ */
@Argument(shortName="knownName", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
- protected String[] KNOWN_NAMES = {};
+ protected HashSet KNOWN_NAMES = new HashSet();
+ List> knowns = new ArrayList>();
// Stratification arguments
@Argument(fullName="stratificationModule", shortName="ST", doc="One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)", required=false)
@@ -80,7 +125,9 @@ public class VariantEvalWalker extends RodWalker implements Tr
@Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false)
protected Set typesToUse = null;
- // Evaluator arguments
+ /**
+ * See the -list argument to view available modules.
+ */
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false)
protected String[] MODULES_TO_USE = {};
@@ -94,7 +141,10 @@ public class VariantEvalWalker extends RodWalker implements Tr
@Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false)
protected double MIN_PHASE_QUALITY = 10.0;
- @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
+ /**
+ * This argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined.
+ */
+ @Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations", required=false)
protected String FAMILY_STRUCTURE;
@Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
@@ -108,9 +158,6 @@ public class VariantEvalWalker extends RodWalker implements Tr
// Variables
private Set jexlExpressions = new TreeSet();
- private Set compNames = new TreeSet();
- private Set knownNames = new TreeSet();
- private Set evalNames = new TreeSet();
private Set sampleNamesForEvaluation = new TreeSet();
private Set sampleNamesForStratification = new TreeSet();
@@ -149,23 +196,24 @@ public class VariantEvalWalker extends RodWalker implements Tr
comps.addAll(compsProvided);
if ( dbsnp.dbsnp.isBound() ) {
comps.add(dbsnp.dbsnp);
- knownNames.add(dbsnp.dbsnp.getName());
+ knowns.add(dbsnp.dbsnp);
}
// Add a dummy comp track if none exists
if ( comps.size() == 0 )
comps.add(new RodBinding(VariantContext.class, "none", "UNBOUND", "", new Tags()));
- // Cache the rod names
- for ( RodBinding compRod : comps )
- compNames.add(compRod.getName());
+ // Set up set of additional knowns
+ for ( RodBinding compRod : comps ) {
+ if ( KNOWN_NAMES.contains(compRod.getName()) )
+ knowns.add(compRod);
+ }
+ // Collect the eval rod names
+ Set evalNames = new TreeSet();
for ( RodBinding evalRod : evals )
evalNames.add(evalRod.getName());
- // Set up set of additional known names
- knownNames.addAll(Arrays.asList(KNOWN_NAMES));
-
// Now that we have all the rods categorized, determine the sample list from the eval rods.
Map vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evalNames);
Set vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
@@ -263,7 +311,8 @@ public class VariantEvalWalker extends RodWalker implements Tr
// for each comp track
for ( final RodBinding compRod : comps ) {
// no sample stratification for comps
- final Set compSet = compVCs.get(compRod) == null ? new HashSet(0) : compVCs.get(compRod).values().iterator().next();
+ final HashMap> compSetHash = compVCs.get(compRod);
+ final Set compSet = (compSetHash == null || compSetHash.size() == 0) ? new HashSet(0) : compVCs.get(compRod).values().iterator().next();
// find the comp
final VariantContext comp = findMatchingComp(eval, compSet);
@@ -462,15 +511,15 @@ public class VariantEvalWalker extends RodWalker implements Tr
public static String getAllSampleName() { return ALL_SAMPLE_NAME; }
- public Set getKnownNames() { return knownNames; }
+ public List> getKnowns() { return knowns; }
- public Set getEvalNames() { return evalNames; }
+ public List> getEvals() { return evals; }
public Set getSampleNamesForEvaluation() { return sampleNamesForEvaluation; }
public Set getSampleNamesForStratification() { return sampleNamesForStratification; }
- public Set getCompNames() { return compNames; }
+ public List> getComps() { return comps; }
public Set getJexlExpressions() { return jexlExpressions; }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java
index 87b8bac1d..59ef3d992 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/CountVariants.java
@@ -39,8 +39,10 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
public long nInsertions = 0;
@DataPoint(description = "Number of deletions")
public long nDeletions = 0;
- @DataPoint(description = "Number of complex loci")
+ @DataPoint(description = "Number of complex indels")
public long nComplex = 0;
+ @DataPoint(description = "Number of mixed loci (loci that can't be classified as a SNP, Indel or MNP)")
+ public long nMixed = 0;
@DataPoint(description = "Number of no calls loci")
@@ -97,27 +99,35 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
// This is really not correct. What we really want here is a polymorphic vs. monomorphic count (i.e. on the Genotypes).
// So in order to maintain consistency with the previous implementation (and the intention of the original author), I've
// added in a proxy check for monomorphic status here.
- if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() == vc1.getNSamples()) ) {
 + // Protect against the case when vc only has no-calls too - can happen if we stratify by sample and the sample has a single no-call.
+ if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() + vc1.getNoCallCount() == vc1.getNSamples()) ) {
nRefLoci++;
} else {
- nVariantLoci++;
- switch (vc1.getType()) {
+ switch (vc1.getType()) {
case NO_VARIATION:
break;
case SNP:
+ nVariantLoci++;
nSNPs++;
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
break;
case MNP:
+ nVariantLoci++;
nMNPs++;
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
break;
case INDEL:
- if (vc1.isInsertion()) nInsertions++;
- else nDeletions++;
+ nVariantLoci++;
+ if (vc1.isSimpleInsertion())
+ nInsertions++;
+ else if (vc1.isSimpleDeletion())
+ nDeletions++;
+ else
+ nComplex++;
break;
case MIXED:
- nComplex++;
+ nVariantLoci++;
+ nMixed++;
break;
default:
throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType());
@@ -180,8 +190,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
heterozygosity = perLocusRate(nHets);
heterozygosityPerBp = perLocusRInverseRate(nHets);
hetHomRatio = ratio(nHets, nHomVar);
- indelRate = perLocusRate(nDeletions + nInsertions);
- indelRatePerBp = perLocusRInverseRate(nDeletions + nInsertions);
+ indelRate = perLocusRate(nDeletions + nInsertions + nComplex);
+ indelRatePerBp = perLocusRInverseRate(nDeletions + nInsertions + nComplex);
deletionInsertionRatio = ratio(nDeletions, nInsertions);
}
}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java
index 77def0f30..35fffd815 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelLengthHistogram.java
@@ -96,9 +96,9 @@ public class IndelLengthHistogram extends VariantEvaluator {
}
if ( vc1.isIndel() ) {
- if ( vc1.isInsertion() ) {
+ if ( vc1.isSimpleInsertion() ) {
indelHistogram.update(vc1.getAlternateAllele(0).length());
- } else if ( vc1.isDeletion() ) {
+ } else if ( vc1.isSimpleDeletion() ) {
indelHistogram.update(-vc1.getReference().length());
} else {
throw new ReviewedStingException("Indel type that is not insertion or deletion.");
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java
deleted file mode 100755
index 6e1b76acd..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelMetricsByAC.java
+++ /dev/null
@@ -1,221 +0,0 @@
-package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
-
-import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
-import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
-import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
-import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
-
-import java.util.ArrayList;
-
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * @author delangel
- * @since Apr 11, 2010
- */
-
-@Analysis(name = "Indel Metrics by allele count", description = "Shows various stats binned by allele count")
-public class IndelMetricsByAC extends VariantEvaluator {
- // a mapping from quality score histogram bin to Ti/Tv ratio
- @DataPoint(description = "Indel Metrics by allele count")
- IndelMetricsByAc metrics = null;
-
- int numSamples = 0;
-
- public void initialize(VariantEvalWalker walker) {
- numSamples = walker.getNumSamples();
- }
-
- //@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count")
- //AlleleCountStats alleleCountStats = null;
- private static final int INDEL_SIZE_LIMIT = 100;
- private static final int NUM_SCALAR_COLUMNS = 6;
- static int len2Index(int ind) {
- return ind+INDEL_SIZE_LIMIT;
- }
-
- static int index2len(int ind) {
- return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS;
- }
-
- protected final static String[] METRIC_COLUMNS;
- static {
- METRIC_COLUMNS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1];
- METRIC_COLUMNS[0] = "AC";
- METRIC_COLUMNS[1] = "nIns";
- METRIC_COLUMNS[2] = "nDels";
- METRIC_COLUMNS[3] = "n";
- METRIC_COLUMNS[4] = "nComplex";
- METRIC_COLUMNS[5] = "nLong";
-
- for (int k=NUM_SCALAR_COLUMNS; k < NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT+1; k++)
- METRIC_COLUMNS[k] = "indel_size_len"+Integer.valueOf(index2len(k));
- }
-
- class IndelMetricsAtAC {
- public int ac = -1, nIns =0, nDel = 0, nComplex = 0, nLong;
- public int sizeCount[] = new int[2*INDEL_SIZE_LIMIT+1];
-
- public IndelMetricsAtAC(int ac) { this.ac = ac; }
-
- public void update(VariantContext eval) {
- int eventLength = 0;
- if ( eval.isInsertion() ) {
- eventLength = eval.getAlternateAllele(0).length();
- nIns++;
- } else if ( eval.isDeletion() ) {
- eventLength = -eval.getReference().length();
- nDel++;
- }
- else {
- nComplex++;
- }
- if (Math.abs(eventLength) < INDEL_SIZE_LIMIT)
- sizeCount[len2Index(eventLength)]++;
- else
- nLong++;
-
-
-
- }
-
- // corresponding to METRIC_COLUMNS
- public String getColumn(int i) {
- if (i >= NUM_SCALAR_COLUMNS && i <=NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT)
- return String.valueOf(sizeCount[i-NUM_SCALAR_COLUMNS]);
-
- switch (i) {
- case 0: return String.valueOf(ac);
- case 1: return String.valueOf(nIns);
- case 2: return String.valueOf(nDel);
- case 3: return String.valueOf(nIns + nDel);
- case 4: return String.valueOf(nComplex);
- case 5: return String.valueOf(nLong);
-
- default:
- throw new ReviewedStingException("Unexpected column " + i);
- }
- }
- }
-
- class IndelMetricsByAc implements TableType {
- ArrayList metrics = new ArrayList();
- Object[] rows = null;
-
- public IndelMetricsByAc( int nchromosomes ) {
- rows = new Object[nchromosomes+1];
- metrics = new ArrayList(nchromosomes+1);
- for ( int i = 0; i < nchromosomes + 1; i++ ) {
- metrics.add(new IndelMetricsAtAC(i));
- rows[i] = "ac" + i;
- }
- }
-
- public Object[] getRowKeys() {
- return rows;
- }
-
- public Object[] getColumnKeys() {
- return METRIC_COLUMNS;
- }
-
- public String getName() {
- return "IndelMetricsByAc";
- }
-
- //
- public String getCell(int ac, int y) {
- return metrics.get(ac).getColumn(y);
- }
-
- public String toString() {
- return "";
- }
-
- public void incrValue( VariantContext eval ) {
- int ac = -1;
-
- if ( eval.hasGenotypes() )
- ac = eval.getChromosomeCount(eval.getAlternateAllele(0));
- else if ( eval.hasAttribute("AC") ) {
- ac = Integer.valueOf(eval.getAttributeAsString("AC"));
- }
-
- if ( ac != -1 )
- metrics.get(ac).update(eval);
- }
- }
-
- //public IndelMetricsByAC(VariantEvalWalker parent) {
- //super(parent);
- // don't do anything
- //}
-
- public String getName() {
- return "IndelMetricsByAC";
- }
-
- public int getComparisonOrder() {
- return 1; // we only need to see each eval track
- }
-
- public boolean enabled() {
- return true;
- }
-
- public String toString() {
- return getName();
- }
-
- public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
- final String interesting = null;
-
- if (eval != null ) {
- if ( metrics == null ) {
- int nSamples = numSamples;
- //int nSamples = 2;
- if ( nSamples != -1 )
- metrics = new IndelMetricsByAc(2 * nSamples);
- }
-
- if ( eval.isIndel() && eval.isBiallelic() &&
- metrics != null ) {
- metrics.incrValue(eval);
- }
- }
-
- return interesting; // This module doesn't capture any interesting sites, so return null
- }
-
- //public void finalizeEvaluation() {
- //
- //}
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java
index d99196ecf..fc347339d 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/IndelStatistics.java
@@ -44,7 +44,7 @@ public class IndelStatistics extends VariantEvaluator {
@DataPoint(description = "Indel Statistics")
IndelStats indelStats = null;
- @DataPoint(description = "Indel Classification")
+ // @DataPoint(description = "Indel Classification")
IndelClasses indelClasses = null;
int numSamples = 0;
@@ -57,13 +57,13 @@ public class IndelStatistics extends VariantEvaluator {
private static final int IND_HET = 0;
private static final int IND_INS = 1;
private static final int IND_DEL = 2;
- private static final int IND_AT_CG_RATIO = 3;
+ private static final int IND_COMPLEX = 3;
private static final int IND_HET_INS = 4;
private static final int IND_HOM_INS = 5;
private static final int IND_HET_DEL = 6;
private static final int IND_HOM_DEL = 7;
private static final int IND_HOM_REF = 8;
- private static final int IND_COMPLEX = 9;
+ private static final int IND_MIXED = 9;
private static final int IND_LONG = 10;
private static final int IND_AT_EXP = 11;
private static final int IND_CG_EXP = 12;
@@ -79,15 +79,14 @@ public class IndelStatistics extends VariantEvaluator {
}
static class IndelStats implements TableType {
- protected final static String ALL_SAMPLES_KEY = "allSamples";
- protected final static String[] COLUMN_KEYS;
+ protected final static String[] COLUMN_KEYS;
static {
COLUMN_KEYS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1];
COLUMN_KEYS[0] = "heterozygosity";
COLUMN_KEYS[1] = "insertions";
COLUMN_KEYS[2] = "deletions";
- COLUMN_KEYS[3] = "AT_CG_expansion_ratio";
+ COLUMN_KEYS[3] = "complex";
COLUMN_KEYS[4] = "het_insertions";
COLUMN_KEYS[5] = "homozygous_insertions";
COLUMN_KEYS[6] = "het_deletions";
@@ -104,13 +103,10 @@ public class IndelStatistics extends VariantEvaluator {
}
// map of sample to statistics
- protected final HashMap indelSummary = new HashMap();
+ protected final int[] indelSummary;
public IndelStats(final VariantContext vc) {
- indelSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]);
- for( final String sample : vc.getGenotypes().keySet() ) {
- indelSummary.put(sample, new int[COLUMN_KEYS.length]);
- }
+ indelSummary = new int[COLUMN_KEYS.length];
}
/**
@@ -118,19 +114,10 @@ public class IndelStatistics extends VariantEvaluator {
* @return one row per sample
*/
public Object[] getRowKeys() {
- return indelSummary.keySet().toArray(new String[indelSummary.size()]);
+ return new String[]{"all"};
}
public Object getCell(int x, int y) {
- final Object[] rowKeys = getRowKeys();
- if (y == IND_AT_CG_RATIO) {
-
- int at = indelSummary.get(rowKeys[x])[IND_AT_EXP];
- int cg = indelSummary.get(rowKeys[x])[IND_CG_EXP];
- return String.format("%4.2f",((double)at) / (Math.max(cg, 1)));
- }
- else
- return String.format("%d",indelSummary.get(rowKeys[x])[y]);
-
+ return String.format("%d",indelSummary[y]);
}
/**
@@ -160,96 +147,49 @@ public class IndelStatistics extends VariantEvaluator {
int eventLength = 0;
boolean isInsertion = false, isDeletion = false;
- if ( vc.isInsertion() ) {
+ if ( vc.isSimpleInsertion() ) {
eventLength = vc.getAlternateAllele(0).length();
- indelSummary.get(ALL_SAMPLES_KEY)[IND_INS]++;
+ indelSummary[IND_INS]++;
isInsertion = true;
- } else if ( vc.isDeletion() ) {
- indelSummary.get(ALL_SAMPLES_KEY)[IND_DEL]++;
+ } else if ( vc.isSimpleDeletion() ) {
+ indelSummary[IND_DEL]++;
eventLength = -vc.getReference().length();
isDeletion = true;
}
- else {
- indelSummary.get(ALL_SAMPLES_KEY)[IND_COMPLEX]++;
+ else if (vc.isComplexIndel()) {
+ indelSummary[IND_COMPLEX]++;
}
+ else if (vc.isMixed())
+ indelSummary[IND_MIXED]++;
+
if (IndelUtils.isATExpansion(vc,ref))
- indelSummary.get(ALL_SAMPLES_KEY)[IND_AT_EXP]++;
+ indelSummary[IND_AT_EXP]++;
if (IndelUtils.isCGExpansion(vc,ref))
- indelSummary.get(ALL_SAMPLES_KEY)[IND_CG_EXP]++;
+ indelSummary[IND_CG_EXP]++;
// make sure event doesn't overstep array boundaries
- if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) {
- indelSummary.get(ALL_SAMPLES_KEY)[len2Index(eventLength)]++;
- if (eventLength % 3 != 0)
- indelSummary.get(ALL_SAMPLES_KEY)[IND_FRAMESHIFT]++;
- }
- else
- indelSummary.get(ALL_SAMPLES_KEY)[IND_LONG]++;
-
-
- for( final String sample : vc.getGenotypes().keySet() ) {
- if ( indelSummary.containsKey(sample) ) {
- Genotype g = vc.getGenotype(sample);
- boolean isVariant = (g.isCalled() && !g.isHomRef());
- if (isVariant) {
- // update ins/del count
- if (isInsertion) {
- indelSummary.get(sample)[IND_INS]++;
- }
- else if (isDeletion)
- indelSummary.get(sample)[IND_DEL]++;
- else
- indelSummary.get(sample)[IND_COMPLEX]++;
-
- // update histogram
- if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) {
- indelSummary.get(sample)[len2Index(eventLength)]++;
- if (eventLength % 3 != 0)
- indelSummary.get(sample)[IND_FRAMESHIFT]++;
- }
- else
- indelSummary.get(sample)[IND_LONG]++;
-
- if (g.isHet())
- if (isInsertion)
- indelSummary.get(sample)[IND_HET_INS]++;
- else if (isDeletion)
- indelSummary.get(sample)[IND_HET_DEL]++;
- else
- if (isInsertion)
- indelSummary.get(sample)[IND_HOM_INS]++;
- else if (isDeletion)
- indelSummary.get(sample)[IND_HOM_DEL]++;
-
- if (IndelUtils.isATExpansion(vc,ref))
- indelSummary.get(sample)[IND_AT_EXP]++;
- if (IndelUtils.isCGExpansion(vc,ref))
- indelSummary.get(sample)[IND_CG_EXP]++;
-
-
- }
- else
- indelSummary.get(sample)[IND_HOM_REF]++;
+ if (vc.isSimpleDeletion() || vc.isSimpleInsertion()) {
+ if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) {
+ indelSummary[len2Index(eventLength)]++;
+ if (eventLength % 3 != 0)
+ indelSummary[IND_FRAMESHIFT]++;
}
+ else
+ indelSummary[IND_LONG]++;
}
-
}
}
static class IndelClasses implements TableType {
- protected final static String ALL_SAMPLES_KEY = "allSamples";
protected final static String[] columnNames = IndelUtils.getIndelClassificationNames();
// map of sample to statistics
- protected final HashMap indelClassSummary = new HashMap();
+ protected final int[] indelClassSummary;
public IndelClasses(final VariantContext vc) {
- indelClassSummary.put(ALL_SAMPLES_KEY, new int[columnNames.length]);
- for( final String sample : vc.getGenotypes().keySet() ) {
- indelClassSummary.put(sample, new int[columnNames.length]);
- }
+ indelClassSummary = new int[columnNames.length];
}
/**
@@ -257,11 +197,10 @@ public class IndelStatistics extends VariantEvaluator {
* @return one row per sample
*/
public Object[] getRowKeys() {
- return indelClassSummary.keySet().toArray(new String[indelClassSummary.size()]);
+ return new String[]{"all"};
}
public Object getCell(int x, int y) {
- final Object[] rowKeys = getRowKeys();
- return String.format("%d",indelClassSummary.get(rowKeys[x])[y]);
+ return String.format("%d",indelClassSummary[y]);
}
/**
@@ -285,18 +224,7 @@ public class IndelStatistics extends VariantEvaluator {
}
private void incrementSampleStat(VariantContext vc, int index) {
- indelClassSummary.get(ALL_SAMPLES_KEY)[index]++;
- for( final String sample : vc.getGenotypes().keySet() ) {
- if ( indelClassSummary.containsKey(sample) ) {
- Genotype g = vc.getGenotype(sample);
- boolean isVariant = (g.isCalled() && !g.isHomRef());
- if (isVariant)
- // update count
- indelClassSummary.get(sample)[index]++;
-
- }
- }
-
+ indelClassSummary[index]++;
}
/*
* increment the specified value
@@ -344,16 +272,13 @@ public class IndelStatistics extends VariantEvaluator {
if (eval != null ) {
if ( indelStats == null ) {
- int nSamples = numSamples;
-
- if ( nSamples != -1 )
- indelStats = new IndelStats(eval);
+ indelStats = new IndelStats(eval);
}
if ( indelClasses == null ) {
indelClasses = new IndelClasses(eval);
}
- if ( eval.isIndel() && eval.isBiallelic() ) {
+ if ( eval.isIndel() || eval.isMixed() ) {
if (indelStats != null )
indelStats.incrValue(eval, ref);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java
index 411493d4f..5cdea4e00 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java
@@ -1,23 +1,25 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
+import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
-import java.util.Set;
+import java.util.List;
public class AlleleCount extends VariantStratifier {
// needs to know the variant context
private ArrayList states = new ArrayList();
@Override
- public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) {
+ public void initialize() {
+ List> evals = getVariantEvalWalker().getEvals();
+
// we can only work with a single eval VCF, and it must have genotypes
- if ( evalNames.size() != 1 )
+ if ( evals.size() != 1 )
throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification only works with a single eval vcf");
// There are 2 x n sample chromosomes for diploids
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java
index 2ffc7716c..96d9f30ec 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleFrequency.java
@@ -2,19 +2,17 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
-import java.util.Set;
public class AlleleFrequency extends VariantStratifier {
// needs to know the variant context
private ArrayList states;
@Override
- public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) {
+ public void initialize() {
states = new ArrayList();
for( double a = 0.000; a <= 1.005; a += 0.005 ) {
states.add(String.format("%.3f", a));
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java
index c6975808f..9f4123589 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CompRod.java
@@ -1,24 +1,20 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
+import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
-import java.util.Set;
public class CompRod extends VariantStratifier implements RequiredStratification {
- // Needs to know the comp rods
- private Set compNames;
private ArrayList states;
@Override
- public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) {
- this.compNames = compNames;
-
+ public void initialize() {
states = new ArrayList();
- states.addAll(compNames);
+ for ( RodBinding rod : getVariantEvalWalker().getComps() )
+ states.add(rod.getName());
}
public ArrayList getAllStates() {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java
index c14355035..e12a1ba97 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Contig.java
@@ -2,20 +2,18 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
-import java.util.Set;
public class Contig extends VariantStratifier {
// needs to know the variant context
private ArrayList states;
@Override
- public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) {
+ public void initialize() {
states = new ArrayList();
- states.addAll(contigNames);
+ states.addAll(getVariantEvalWalker().getContigNames());
states.add("all");
}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java
index e1f2ae983..ff49c8ba9 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/CpG.java
@@ -2,11 +2,9 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
-import java.util.Set;
/**
* CpG is a stratification module for VariantEval that divides the input data by within/not within a CpG site
@@ -24,7 +22,7 @@ public class CpG extends VariantStratifier {
private ArrayList states;
@Override
- public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) {
+ public void initialize() {
states = new ArrayList();
states.add("all");
states.add("CpG");
@@ -40,7 +38,7 @@ public class CpG extends VariantStratifier {
if (ref != null && ref.getBases() != null) {
String fwRefBases = new String(ref.getBases());
- String leftFlank = fwRefBases.substring((fwRefBases.length()/2) - 1, (fwRefBases.length()/2) + 1);
+ //String leftFlank = fwRefBases.substring((fwRefBases.length()/2) - 1, (fwRefBases.length()/2) + 1);
String rightFlank = fwRefBases.substring((fwRefBases.length()/2), (fwRefBases.length()/2) + 2);
//if (leftFlank.equalsIgnoreCase("CG") || leftFlank.equalsIgnoreCase("GC") || rightFlank.equalsIgnoreCase("CG") || rightFlank.equalsIgnoreCase("GC")) {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java
index 155a66186..cc878e975 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/Degeneracy.java
@@ -2,13 +2,11 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
-import java.util.Set;
public class Degeneracy extends VariantStratifier {
private ArrayList states;
@@ -16,7 +14,7 @@ public class Degeneracy extends VariantStratifier {
private HashMap> degeneracies;
@Override
- public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set evalNames, Set sampleNames, Set contigNames) {
+ public void initialize() {
states = new ArrayList();
states.add("1-fold");
states.add("2-fold");
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java
index 40f952fd2..0bfecee25 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/EvalRod.java
@@ -1,24 +1,20 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
+import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
-import java.util.Set;
public class EvalRod extends VariantStratifier implements RequiredStratification {
- // needs to know the eval rods
- private Set evalNames;
private ArrayList states;
@Override
- public void initialize(Set jexlExpressions, Set compNames, Set knownNames, Set