Merging ReduceReads development with unstable repo

It is time to bring the ReadClipper class to the main repo. ReadClipper has tested functionality for soft and hard clipping of reads. I will prepare thorough documentation for it, as it will be very useful for the assembler and the GATK in general.
This commit is contained in:
Mauricio Carneiro 2011-08-22 23:03:03 -04:00
commit feeab6075f
179 changed files with 7551 additions and 2541 deletions

View File

@ -49,7 +49,7 @@
<!-- Contracts for Java -->
<!-- To disable, run with -Duse.contracts=false -->
<property name="use.contracts" value="true" />
<property name="use.contracts" value="false" />
<property name="java.contracts" value="${build.dir}/java/contracts" />
<property name="contracts.version" value="1.0-20110609" />
<property name="cofoja.jar" value="${lib.dir}/cofoja-${contracts.version}.jar"/>
@ -489,7 +489,7 @@
docletpathref="doclet.classpath"
classpathref="external.dependencies"
classpath="${java.classes}"
additionalparam="-private -build-timestamp &quot;${build.timestamp}&quot; -absolute-version ${build.version} -quiet -J-Xdebug -J-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005">
additionalparam="-private -build-timestamp &quot;${build.timestamp}&quot; -absolute-version ${build.version} -quiet -J-Xdebug -J-Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=5005"> <!-- -test to only do DocumentationTest walker -->
<sourcefiles>
<union>
<fileset refid="all.java.source.files"/>

View File

@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.text.XReadLines;
import java.io.*;
@ -42,19 +43,71 @@ import java.util.Map;
import java.util.regex.Pattern;
/**
* Created by IntelliJ IDEA.
* User: rpoplin
* Date: Dec 1, 2009
* Call R scripts to plot residual error versus the various covariates.
*
* Create collapsed versions of the recal csv file and call R scripts to plot residual error versus the various covariates.
* <p>
* After counting covariates in either the initial BAM File or again in the recalibrated BAM File, an analysis tool is available which
* reads the .csv file and outputs several PDF (and .dat) files for each read group in the given BAM. These PDF files graphically
* show the various metrics and characteristics of the reported quality scores (often in relation to the empirical qualities).
* In order to show that any biases in the reported quality scores have been generally fixed through recalibration one should run
* CountCovariates again on a bam file produced by TableRecalibration. In this way users can compare the analysis plots generated
* by pre-recalibration and post-recalibration .csv files. Our usual chain of commands that we use to generate plots of residual
* error is: CountCovariates, TableRecalibrate, samtools index on the recalibrated bam file, CountCovariates again on the recalibrated
* bam file, and then AnalyzeCovariates on both the before and after recal_data.csv files to see the improvement in recalibration.
*
* <p>
* The color coding along with the RMSE is included in the plots to give some indication of the number of observations that went into
* each of the quality score estimates. It is defined as follows for N, the number of observations:
*
* <ul>
* <li>light blue means N < 1,000</li>
* <li>cornflower blue means 1,000 <= N < 10,000</li>
* <li>dark blue means N >= 10,000</li>
* <li>The pink dots indicate points whose quality scores are special codes used by the aligner and which are mathematically
* meaningless and so aren't included in any of the numerical calculations.</li>
* </ul>
*
* <p>
* NOTE: For those running this tool externally from the Broad, it is crucial to note that both the -Rscript and -resources options
* must be changed from the default. -Rscript needs to point to your installation of Rscript (this is the scripting version of R,
* not the interactive version) while -resources needs to point to the folder holding the R scripts that are used. For those using
* this tool as part of the Binary Distribution the -resources should point to the resources folder that is part of the tarball.
* For those using this tool by building from the git repository the -resources should point to the R/ subdirectory of the Sting checkout.
*
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
*
* <h2>Input</h2>
* <p>
* The recalibration table file in CSV format that was generated by the CountCovariates walker.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar AnalyzeCovariates.jar \
* -recalFile /path/to/recal.table.csv \
* -outputDir /path/to/output_dir/ \
* -resources resources/ \
* -ignoreQ 5
* </pre>
*
*/
@DocumentedGATKFeature(
groupName = "AnalyzeCovariates",
summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
public class AnalyzeCovariates extends CommandLineProgram {
/////////////////////////////
// Command Line Arguments
/////////////////////////////
/**
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
* three items are the data — that is, the number of observations for this combination of covariates, the number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/
@Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false)
private String RECAL_FILE = "output.recal_data.csv";
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
@ -67,11 +120,20 @@ public class AnalyzeCovariates extends CommandLineProgram {
private int IGNORE_QSCORES_LESS_THAN = 5;
@Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
/**
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
* by capping at the specified value. We've found that Q40 is too low when using a more complete database of known variation like dbSNP build 132 or later.
*/
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 50")
private int MAX_QUALITY_SCORE = 50;
/**
* This argument is useful for comparing before/after plots and you want the axes to match each other.
*/
@Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
private int MAX_HISTOGRAM_VALUE = 0;
@Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, this value will be the max value of the histogram plots")
@Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting")
private boolean DO_INDEL_QUALITY = false;

View File

@ -0,0 +1,4 @@
/**
 * Package to plot residual accuracy versus error covariates for the base quality score recalibrator.
 *
 * <p>Contains the {@code AnalyzeCovariates} command-line tool, which reads the
 * recalibration table (.csv) produced by CountCovariates and generates per-read-group
 * plots comparing reported to empirical quality scores.</p>
 */
package org.broadinstitute.sting.analyzecovariates;

View File

@ -0,0 +1,40 @@
/*
 * Copyright (c) 2011, The Broad Institute
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

package org.broadinstitute.sting.commandline;

import java.lang.annotation.*;

/**
 * Indicates that a walker argument is considered an advanced option.
 *
 * <p>Marker annotation with runtime retention; it may be applied to types and
 * fields and is inherited by subclasses. It is consulted via
 * {@code ArgumentSource.isAdvanced()} when displaying command-line arguments.</p>
 *
 * @author Mark DePristo
 * @version 0.1
 */
@Documented
@Inherited
@Retention(RetentionPolicy.RUNTIME)
@Target({ElementType.TYPE,ElementType.FIELD})
public @interface Advanced {
}

View File

@ -151,6 +151,14 @@ public class ArgumentSource {
return field.isAnnotationPresent(Hidden.class) || field.isAnnotationPresent(Deprecated.class);
}
/**
 * Determines whether this command-line argument is flagged as an advanced
 * option for display purposes in the command-line argument system.
 *
 * @return true if the underlying field is annotated with {@code @Advanced};
 *         false otherwise.
 */
public boolean isAdvanced() {
    final boolean flaggedAdvanced = field.isAnnotationPresent(Advanced.class);
    return flaggedAdvanced;
}
/**
* Is this command-line argument dependent on some primitive argument types?
* @return True if this command-line argument depends on other arguments; false otherwise.

View File

@ -325,7 +325,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
@Override
public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) {
Class parameterType = getParameterizedTypeClass(type);
Class parameterType = JVMUtils.getParameterizedTypeClass(type);
return RodBinding.makeUnbound((Class<? extends Feature>)parameterType);
}
@ -338,6 +338,8 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) {
ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
String value = getArgumentValue( defaultDefinition, matches );
Class<? extends Feature> parameterType = JVMUtils.getParameterizedTypeClass(type);
try {
String name = defaultDefinition.fullName;
String tribbleType = null;
@ -372,19 +374,19 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file);
if ( featureDescriptor != null ) {
tribbleType = featureDescriptor.getName();
logger.warn("Dynamically determined type of " + file + " to be " + tribbleType);
logger.info("Dynamically determined type of " + file + " to be " + tribbleType);
}
}
if ( tribbleType == null )
throw new UserException.CommandLineException(
String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " +
"Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s",
manager.userFriendlyListOfAvailableFeatures(parameterType)));
}
}
if ( tribbleType == null ) // error handling
throw new UserException.CommandLineException(
String.format("Could not parse argument %s with value %s",
defaultDefinition.fullName, value));
Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class);
Class parameterType = getParameterizedTypeClass(type);
RodBinding result = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags);
parsingEngine.addTags(result,tags);
parsingEngine.addRodBinding(result);
@ -395,20 +397,10 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
value, source.field.getName()));
} catch (Exception e) {
throw new UserException.CommandLineException(
String.format("Failed to parse value %s for argument %s.",
value, source.field.getName()));
String.format("Failed to parse value %s for argument %s. Message: %s",
value, source.field.getName(), e.getMessage()));
}
}
private Class getParameterizedTypeClass(Type t) {
if ( t instanceof ParameterizedType ) {
ParameterizedType parameterizedType = (ParameterizedType)t;
if ( parameterizedType.getActualTypeArguments().length != 1 )
throw new ReviewedStingException("BUG: more than 1 generic type found on class" + t);
return (Class)parameterizedType.getActualTypeArguments()[0];
} else
throw new ReviewedStingException("BUG: could not find generic type on class " + t);
}
}
/**

View File

@ -55,7 +55,7 @@ public @interface Output {
* --help argument is specified.
* @return Doc string associated with this command-line argument.
*/
String doc() default "An output file presented to the walker. Will overwrite contents if file exists.";
String doc() default "An output file created by the walker. Will overwrite contents if file exists";
/**
* Is this argument required. If true, the command-line argument system will

View File

@ -96,24 +96,23 @@ public abstract class CommandLineExecutable extends CommandLineProgram {
loadArgumentsIntoObject(walker);
argumentSources.add(walker);
Collection<RMDTriplet> newStyle = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
Collection<RMDTriplet> rodBindings = ListFileUtils.unpackRODBindings(parser.getRodBindings(), parser);
// todo: remove me when the old style system is removed
if ( getArgumentCollection().RODBindings.size() > 0 ) {
logger.warn("################################################################################");
logger.warn("################################################################################");
logger.warn("Deprecated -B rod binding syntax detected. This syntax will be retired in GATK 1.2.");
logger.warn("Deprecated -B rod binding syntax detected. This syntax has been eliminated in GATK 1.2.");
logger.warn("Please use arguments defined by each specific walker instead.");
for ( String oldStyleRodBinding : getArgumentCollection().RODBindings ) {
logger.warn(" -B rod binding with value " + oldStyleRodBinding + " tags: " + parser.getTags(oldStyleRodBinding).getPositionalTags());
}
logger.warn("################################################################################");
logger.warn("################################################################################");
System.exit(1);
}
Collection<RMDTriplet> oldStyle = ListFileUtils.unpackRODBindingsOldStyle(getArgumentCollection().RODBindings, parser);
oldStyle.addAll(newStyle);
engine.setReferenceMetaDataFiles(oldStyle);
engine.setReferenceMetaDataFiles(rodBindings);
for (ReadFilter filter: filters) {
loadArgumentsIntoObject(filter);

View File

@ -31,13 +31,11 @@ import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.CommandLineProgram;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
import org.broadinstitute.sting.gatk.walkers.Attribution;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.help.ApplicationDetails;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.help.GATKDocUtils;
import org.broadinstitute.sting.utils.help.GATKDoclet;
import org.broadinstitute.sting.utils.help.*;
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
import java.util.*;
@ -52,7 +50,7 @@ import java.util.*;
@DocumentedGATKFeature(
groupName = "GATK Engine",
summary = "Features and arguments for the GATK engine itself, available to all walkers.",
extraDocs = { ReadFilter.class, UserException.class })
extraDocs = { UserException.class })
public class CommandLineGATK extends CommandLineExecutable {
@Argument(fullName = "analysis_type", shortName = "T", doc = "Type of analysis to run")
private String analysisName = null;
@ -177,6 +175,10 @@ public class CommandLineGATK extends CommandLineExecutable {
StringBuilder additionalHelp = new StringBuilder();
Formatter formatter = new Formatter(additionalHelp);
formatter.format("Available Reference Ordered Data types:%n");
formatter.format(new FeatureManager().userFriendlyListOfAvailableFeatures());
formatter.format("%n");
formatter.format("For a full description of this walker, see its GATKdocs at:%n");
formatter.format("%s%n", GATKDocUtils.helpLinksToGATKDocs(walkerType));

View File

@ -689,8 +689,6 @@ public class GenomeAnalysisEngine {
validateSuppliedReads();
readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference());
sampleDataSource = new SampleDataSource(getSAMFileHeader(), argCollection.sampleFiles);
for (ReadFilter filter : filters)
filter.initialize(this);
@ -963,7 +961,7 @@ public class GenomeAnalysisEngine {
/**
* Get the list of intervals passed to the engine.
* @return List of intervals.
* @return List of intervals, or null if no intervals are in use
*/
public GenomeLocSortedSet getIntervals() {
return this.intervals;

View File

@ -39,8 +39,7 @@ import org.simpleframework.xml.*;
public class DbsnpArgumentCollection {
/**
* A dbSNP VCF file. Variants in this track will be treated as "known" variants
* in tools using this track.
* A dbSNP VCF file.
*/
@Input(fullName="dbsnp", shortName = "D", doc="dbSNP file", required=false)
public RodBinding<VariantContext> dbsnp;

View File

@ -101,6 +101,8 @@ public class GATKArgumentCollection {
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
public File referenceFile = null;
@Deprecated
@Hidden
@ElementList(required = false)
@Input(fullName = "rodBind", shortName = "B", doc = "Bindings for reference-ordered data, in the form :<name>,<type> <file>", required = false)
public ArrayList<String> RODBindings = new ArrayList<String>();
@ -340,14 +342,6 @@ public class GATKArgumentCollection {
return false;
}
}
if (other.RODBindings.size() != RODBindings.size()) {
return false;
}
for (int x = 0; x < RODBindings.size(); x++) {
if (!RODBindings.get(x).equals(other.RODBindings.get(x))) {
return false;
}
}
if (!other.samFiles.equals(this.samFiles)) {
return false;
}

View File

@ -34,7 +34,7 @@ import net.sf.samtools.SAMRecord;
* Filter out FailsVendorQualityCheck reads.
*/
public class FailsVendorQualityCheckReadFilter extends ReadFilter {
public class FailsVendorQualityCheckFilter extends ReadFilter {
public boolean filterOut( final SAMRecord read ) {
return read.getReadFailsVendorQualityCheckFlag();
}

View File

@ -35,7 +35,7 @@ import org.broadinstitute.sting.commandline.Argument;
* @version 0.1
*/
public class MappingQualityReadFilter extends ReadFilter {
public class MappingQualityFilter extends ReadFilter {
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for calling", required = false)
public int MIN_MAPPING_QUALTY_SCORE = 10;

View File

@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.QualityUtils;
* @version 0.1
*/
public class MappingQualityUnavailableReadFilter extends ReadFilter {
public class MappingQualityUnavailableFilter extends ReadFilter {
public boolean filterOut(SAMRecord rec) {
return (rec.getMappingQuality() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE);
}

View File

@ -33,7 +33,7 @@ import net.sf.samtools.SAMRecord;
* @version 0.1
*/
public class MappingQualityZeroReadFilter extends ReadFilter {
public class MappingQualityZeroFilter extends ReadFilter {
public boolean filterOut(SAMRecord rec) {
return (rec.getMappingQuality() == 0);
}

View File

@ -34,7 +34,7 @@ import net.sf.samtools.SAMRecord;
* Filter out duplicate reads.
*/
public class NotPrimaryAlignmentReadFilter extends ReadFilter {
public class NotPrimaryAlignmentFilter extends ReadFilter {
public boolean filterOut( final SAMRecord read ) {
return read.getNotPrimaryAlignmentFlag();
}

View File

@ -333,10 +333,6 @@ public class RefMetaDataTracker {
return addValues(name, type, new ArrayList<T>(), getTrackDataByName(name), onlyAtThisLoc, true, false);
}
@Deprecated
public <T extends Feature> List<T> getValues(final Class<T> type, final Collection<String> names, final GenomeLoc onlyAtThisLoc) {
return addValues(names, type, new ArrayList<T>(), onlyAtThisLoc, true, false);
}
@Deprecated
public <T extends Feature> T getFirstValue(final Class<T> type, final String name) {
return safeGetFirst(getValues(type, name));
}

View File

@ -1,10 +1,11 @@
package org.broadinstitute.sting.gatk.refdata;
import net.sf.samtools.util.SequenceUtil;
import org.broad.tribble.Feature;
import org.broad.tribble.annotation.Strand;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broad.tribble.gelitext.GeliTextFeature;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
@ -92,6 +93,67 @@ public class VariantContextAdaptors {
// --------------------------------------------------------------------------------------------------------------
private static class DBSnpAdaptor implements VCAdaptor {
private static boolean isSNP(DbSNPFeature feature) {
return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact");
}
private static boolean isMNP(DbSNPFeature feature) {
return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range");
}
private static boolean isInsertion(DbSNPFeature feature) {
return feature.getVariantType().contains("insertion");
}
private static boolean isDeletion(DbSNPFeature feature) {
return feature.getVariantType().contains("deletion");
}
private static boolean isIndel(DbSNPFeature feature) {
return isInsertion(feature) || isDeletion(feature) || isComplexIndel(feature);
}
public static boolean isComplexIndel(DbSNPFeature feature) {
return feature.getVariantType().contains("in-del");
}
/**
* gets the alternate alleles. This method should return all the alleles present at the location,
* NOT including the reference base. This is returned as a string list with no guaranteed ordering
* of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
* frequency).
*
* @return an alternate allele list
*/
public static List<String> getAlternateAlleleList(DbSNPFeature feature) {
List<String> ret = new ArrayList<String>();
for (String allele : getAlleleList(feature))
if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele);
return ret;
}
/**
* gets the alleles. This method should return all the alleles present at the location,
* including the reference base. The first allele should always be the reference allele, followed
* by an unordered list of alternate alleles.
*
* @return an alternate allele list
*/
public static List<String> getAlleleList(DbSNPFeature feature) {
List<String> alleleList = new ArrayList<String>();
// add ref first
if ( feature.getStrand() == Strand.POSITIVE )
alleleList = Arrays.asList(feature.getObserved());
else
for (String str : feature.getObserved())
alleleList.add(SequenceUtil.reverseComplement(str));
if ( alleleList.size() > 0 && alleleList.contains(feature.getNCBIRefBase())
&& !alleleList.get(0).equals(feature.getNCBIRefBase()) )
Collections.swap(alleleList, alleleList.indexOf(feature.getNCBIRefBase()), 0);
return alleleList;
}
/**
* Converts non-VCF formatted dbSNP records to VariantContext.
* @return DbSNPFeature.
@ -102,18 +164,18 @@ public class VariantContextAdaptors {
@Override
public VariantContext convert(String name, Object input, ReferenceContext ref) {
DbSNPFeature dbsnp = (DbSNPFeature)input;
if ( ! Allele.acceptableAlleleBases(DbSNPHelper.getReference(dbsnp)) )
if ( ! Allele.acceptableAlleleBases(dbsnp.getNCBIRefBase()) )
return null;
Allele refAllele = Allele.create(DbSNPHelper.getReference(dbsnp), true);
Allele refAllele = Allele.create(dbsnp.getNCBIRefBase(), true);
if ( DbSNPHelper.isSNP(dbsnp) || DbSNPHelper.isIndel(dbsnp) || DbSNPHelper.isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
if ( isSNP(dbsnp) || isIndel(dbsnp) || isMNP(dbsnp) || dbsnp.getVariantType().contains("mixed") ) {
// add the reference allele
List<Allele> alleles = new ArrayList<Allele>();
alleles.add(refAllele);
// add all of the alt alleles
boolean sawNullAllele = refAllele.isNull();
for ( String alt : DbSNPHelper.getAlternateAlleleList(dbsnp) ) {
for ( String alt : getAlternateAlleleList(dbsnp) ) {
if ( ! Allele.acceptableAlleleBases(alt) ) {
//System.out.printf("Excluding dbsnp record %s%n", dbsnp);
return null;

View File

@ -1,169 +0,0 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.refdata.features;
import net.sf.samtools.util.SequenceUtil;
import org.broad.tribble.annotation.Strand;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
/**
 * this class contains static helper methods for DbSNP
 *
 * <p>All methods are static and operate on Tribble {@link DbSNPFeature} records:
 * predicates on the variant type and validation status, allele-list extraction,
 * and simple string renderings. The class is not instantiable.</p>
 */
public class DbSNPHelper {
    private DbSNPHelper() {} // don't make a DbSNPHelper

    /**
     * Finds the rsID of the first variant context whose type equals {@code type}.
     *
     * @param VCs  the variant contexts to scan; may be null
     * @param type the variant type to look for
     * @return the ID of the first matching context, or null if {@code VCs} is null
     *         or contains no context of the requested type
     */
    public static String rsIDOfFirstRealVariant(List<VariantContext> VCs, VariantContext.Type type) {
        if ( VCs == null )
            return null;

        String rsID = null;
        for ( VariantContext vc : VCs ) {
            if ( vc.getType() == type ) {
                rsID = vc.getID();
                break;
            }
        }

        return rsID;
    }

    /**
     * get the -1 * (log 10 of the error value)
     *
     * <p>Note: {@code feature} is not consulted; every dbSNP record is assigned a
     * fixed error rate of 1e-4.</p>
     *
     * @return the log based error estimate (always 4)
     */
    public static double getNegLog10PError(DbSNPFeature feature) {
        return 4; // -log10(0.0001)
    }

    //
    // What kind of variant are we?
    //
    // ----------------------------------------------------------------------

    /** True if the variant type contains "single" and the location type contains "exact". */
    public static boolean isSNP(DbSNPFeature feature) {
        return feature.getVariantType().contains("single") && feature.getLocationType().contains("exact");
    }

    /** True if the variant type contains "mnp" and the location type contains "range". */
    public static boolean isMNP(DbSNPFeature feature) {
        return feature.getVariantType().contains("mnp") && feature.getLocationType().contains("range");
    }

    /**
     * Renders the record as "chr:start:rsID:observedAlleles", followed by flag
     * suffixes (":SNP", ":Indel", ":Hapmap", ":2Hit") for each predicate that holds.
     */
    public static String toMediumString(DbSNPFeature feature) {
        String s = String.format("%s:%d:%s:%s", feature.getChr(), feature.getStart(), feature.getRsID(), Utils.join("",feature.getObserved()));
        if (isSNP(feature)) s += ":SNP";
        if (isIndel(feature)) s += ":Indel";
        if (isHapmap(feature)) s += ":Hapmap";
        if (is2Hit2Allele(feature)) s += ":2Hit";
        return s;
    }

    /** True if the variant type contains "insertion". */
    public static boolean isInsertion(DbSNPFeature feature) {
        return feature.getVariantType().contains("insertion");
    }

    /** True if the variant type contains "deletion". */
    public static boolean isDeletion(DbSNPFeature feature) {
        return feature.getVariantType().contains("deletion");
    }

    /** True if the record is an insertion, a deletion, or a complex in-del. */
    public static boolean isIndel(DbSNPFeature feature) {
        return DbSNPHelper.isInsertion(feature) || DbSNPHelper.isDeletion(feature) || DbSNPHelper.isComplexIndel(feature);
    }

    /** True if the variant type contains "in-del" (combined insertion/deletion). */
    public static boolean isComplexIndel(DbSNPFeature feature) {
        return feature.getVariantType().contains("in-del");
    }

    /** True if the validation status contains "by-hapmap". */
    public static boolean isHapmap(DbSNPFeature feature) {
        return feature.getValidationStatus().contains("by-hapmap");
    }

    /** True if the validation status contains "by-2hit-2allele". */
    public static boolean is2Hit2Allele(DbSNPFeature feature) {
        return feature.getValidationStatus().contains("by-2hit-2allele");
    }

    /** True if the validation status contains "by-1000genomes". */
    public static boolean is1000genomes(DbSNPFeature feature) {
        return feature.getValidationStatus().contains("by-1000genomes");
    }

    /** True if the record's weight is 1. */
    public static boolean isMQ1(DbSNPFeature feature) {
        return feature.getWeight() == 1;
    }

    /**
     * gets the alternate alleles. This method should return all the alleles present at the location,
     * NOT including the reference base. This is returned as a string list with no guaranteed ordering
     * of alleles (i.e. the first alternate allele is not always going to be the allele with the greatest
     * frequency).
     *
     * @return an alternate allele list
     */
    public static List<String> getAlternateAlleleList(DbSNPFeature feature) {
        List<String> ret = new ArrayList<String>();
        for (String allele : getAlleleList(feature))
            if (!allele.equals(String.valueOf(feature.getNCBIRefBase()))) ret.add(allele);
        return ret;
    }

    /** True if the record is annotated on the forward (positive) strand. */
    public static boolean onFwdStrand(DbSNPFeature feature) {
        return feature.getStrand() == Strand.POSITIVE;
    }

    /** Returns the NCBI reference base(s) for the record. */
    public static String getReference(DbSNPFeature feature) {
        return feature.getNCBIRefBase();
    }

    /**
     * Renders the record as "rsID:observed:strand".
     *
     * <p>NOTE(review): {@code feature.getObserved()} appears to be an array (it is
     * iterated and passed to {@code Arrays.asList} elsewhere in this class), so the
     * "%s" here would render its identity rather than its contents — confirm this
     * output is intended.</p>
     */
    public static String toSimpleString(DbSNPFeature feature) {
        return String.format("%s:%s:%s", feature.getRsID(), feature.getObserved(), (feature.getStrand() == Strand.POSITIVE) ? "+" : "-");
    }

    /**
     * gets the alleles. This method should return all the alleles present at the location,
     * including the reference base. The first allele should always be the reference allele, followed
     * by an unordered list of alternate alleles.
     *
     * @return the full allele list, with the reference allele first when it is present
     */
    public static List<String> getAlleleList(DbSNPFeature feature) {
        List<String> alleleList = new ArrayList<String>();
        // add ref first
        if ( onFwdStrand(feature) )
            // forward strand: take the observed alleles as-is (Arrays.asList yields a
            // fixed-size view, which still supports the element swap below)
            alleleList = Arrays.asList(feature.getObserved());
        else
            // reverse strand: reverse-complement each observed allele
            for (String str : feature.getObserved())
                alleleList.add(SequenceUtil.reverseComplement(str));
        // move the reference allele to the front if it is present but not already first
        if ( alleleList.size() > 0 && alleleList.contains(getReference(feature)) && !alleleList.get(0).equals(getReference(feature)) )
            Collections.swap(alleleList, alleleList.indexOf(getReference(feature)), 0);
        return alleleList;
    }
}

View File

@ -36,7 +36,10 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.help.GATKDocUtils;
import org.broadinstitute.sting.utils.help.HelpUtils;
import javax.mail.Header;
import java.io.File;
import java.util.*;
@ -50,7 +53,7 @@ import java.util.*;
* @author depristo
*/
public class FeatureManager {
public static class FeatureDescriptor {
public static class FeatureDescriptor implements Comparable<FeatureDescriptor> {
final String name;
final FeatureCodec codec;
@ -62,6 +65,7 @@ public class FeatureManager {
public String getName() {
return name;
}
public String getSimpleFeatureName() { return getFeatureClass().getSimpleName(); }
public FeatureCodec getCodec() {
return codec;
}
@ -70,13 +74,18 @@ public class FeatureManager {
@Override
public String toString() {
return String.format("FeatureDescriptor name=%s codec=%s feature=%s", getName(), getCodecClass().getName(), getFeatureClass().getName());
return String.format("FeatureDescriptor name=%s codec=%s feature=%s",
getName(), getCodecClass().getName(), getFeatureClass().getName());
}
@Override
public int compareTo(FeatureDescriptor o) {
return getName().compareTo(o.getName());
}
}
private final PluginManager<FeatureCodec> pluginManager;
private final Collection<FeatureDescriptor> featureDescriptors = new HashSet<FeatureDescriptor>();
private final Collection<FeatureDescriptor> featureDescriptors = new TreeSet<FeatureDescriptor>();
/**
* Construct a FeatureManager
@ -114,7 +123,7 @@ public class FeatureManager {
*/
@Requires("featureClass != null")
public <T extends Feature> Collection<FeatureDescriptor> getByFeature(Class<T> featureClass) {
Set<FeatureDescriptor> consistentDescriptors = new HashSet<FeatureDescriptor>();
Set<FeatureDescriptor> consistentDescriptors = new TreeSet<FeatureDescriptor>();
if (featureClass == null)
throw new IllegalArgumentException("trackRecordType value is null, please pass in an actual class object");
@ -189,10 +198,40 @@ public class FeatureManager {
*/
@Ensures("result != null")
public String userFriendlyListOfAvailableFeatures() {
List<String> names = new ArrayList<String>();
for ( final FeatureDescriptor descriptor : featureDescriptors )
names.add(descriptor.getName());
return Utils.join(",", names);
return userFriendlyListOfAvailableFeatures(Feature.class);
}
/**
* Returns a list of the available tribble track names (vcf,dbsnp,etc) that we can load
* restricted to only Codecs producing Features consistent with the requiredFeatureType
* @return
*/
@Ensures("result != null")
public String userFriendlyListOfAvailableFeatures(Class<? extends Feature> requiredFeatureType) {
final String nameHeader="Name", featureHeader = "FeatureType", docHeader="Documentation";
int maxNameLen = nameHeader.length(), maxFeatureNameLen = featureHeader.length();
for ( final FeatureDescriptor descriptor : featureDescriptors ) {
if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) {
maxNameLen = Math.max(maxNameLen, descriptor.getName().length());
maxFeatureNameLen = Math.max(maxFeatureNameLen, descriptor.getSimpleFeatureName().length());
}
}
StringBuilder docs = new StringBuilder();
String format = "%" + maxNameLen + "s %" + maxFeatureNameLen + "s %s%n";
docs.append(String.format(format, nameHeader, featureHeader, docHeader));
for ( final FeatureDescriptor descriptor : featureDescriptors ) {
if ( requiredFeatureType.isAssignableFrom(descriptor.getFeatureClass()) ) {
String oneDoc = String.format(format,
descriptor.getName(),
descriptor.getSimpleFeatureName(),
GATKDocUtils.helpLinksToGATKDocs(descriptor.getCodecClass()));
docs.append(oneDoc);
}
}
return docs.toString();
}
/**

View File

@ -30,7 +30,9 @@ import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
@ -42,7 +44,6 @@ import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation;
import org.broadinstitute.sting.utils.clipreads.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.yaml.snakeyaml.events.SequenceStartEvent;
import java.io.File;
import java.io.PrintStream;
@ -51,44 +52,158 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This ReadWalker provides simple, yet powerful read clipping capabilities. It allows the user to clip bases in reads
* with poor quality scores, that match particular sequences, or that were generated by particular machine cycles.
* This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences.
*
*
* <p>
* It allows the user to clip bases in reads with poor quality scores, that match particular
* sequences, or that were generated by particular machine cycles.
*
* <dl>
* <dt>Quality score based clipping</dt>
* <dd>
* Clip bases from the read in clipper from
 * <br>argmax_x { \sum_{i = x + 1}^{l} (qTrimmingThreshold - qual_i) }</br>
 * to the end of the read. This is blatantly stolen from BWA.
*
* Walk through the read from the end (in machine cycle order) to the beginning, calculating the
* running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this
* sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the
* clipping index in the read (from the end).
* </dd>
* <dt>Cycle based clipping</dt>
* <dd>Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
* For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions).
* For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12.
* </dd>
* <dt>Sequence matching</dt>
 * <dd>Clips bases from reads that exactly match one of a number of base sequences. This employs an exact match algorithm,
 * filtering only bases whose sequence exactly matches SEQ.</dd>
* </dl>
*
* </p>
*
* <h2>Input</h2>
* <p>
* Any number of BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* A new BAM file containing all of the reads from the input BAMs with the user-specified clipping
* operation applied to each read.
* </p>
* <p>
* <h3>Summary output</h3>
* <pre>
* Number of examined reads 13
* Number of clipped reads 13
* Percent of clipped reads 100.00
* Number of examined bases 988
* Number of clipped bases 126
* Percent of clipped bases 12.75
* Number of quality-score clipped bases 126
* Number of range clipped bases 0
* Number of sequence clipped bases 0
* </pre>
* </p>
*
* <p>
* <h3>Example clipping</h3>
* Suppose we are given this read:
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
*
* If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:
*
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
* NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
*
* Whereas with -CR WRITE_Q0S:
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
*
* Or -CR SOFTCLIP_BASES:
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3133 29 17S59M * * *
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
* </p>
*
* <h2>Examples</h2>
* <pre>
* -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
* -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
* </pre>
* @author Mark DePristo
* @since 2010
*/
@Requires({DataSource.READS})
public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipperWithData, ClipReadsWalker.ClippingData> {
@Output
PrintStream out;
/**
* If provided, ClipReads will write summary statistics about the clipping operations applied
* to the reads to this file.
*/
@Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false)
PrintStream out = null;
/**
* an optional argument to dump the reads out to a BAM file
* The output SAM/BAM file will be written here
*/
@Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false)
StingSAMFileWriter outputBam = null;
@Output(doc = "Write BAM output here", required = true)
StingSAMFileWriter outputBam;
@Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "", required = false)
/**
* If a value > 0 is provided, then the quality score based read clipper will be applied to the reads using this
* quality score threshold.
*/
@Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "If provided, the Q-score clipper will be applied", required = false)
int qTrimmingThreshold = -1;
@Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false)
/**
* Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
* For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based
* values (positions). For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11,
* and 12.
*/
@Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String indicating machine cycles to clip from the reads", required = false)
String cyclesToClipArg = null;
@Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching these sequences", required = false)
/**
* Reads the sequences in the provided FASTA file, and clip any bases that exactly match any of the
* sequences in the file.
*/
@Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching the sequences in this FASTA file", required = false)
String clipSequenceFile = null;
/**
* Clips bases from the reads matching the provided SEQ. Can be provided any number of times on the command line
*/
@Argument(fullName = "clipSequence", shortName = "X", doc = "Remove sequences within reads matching this sequence", required = false)
String[] clipSequencesArgs = null;
@Argument(fullName="read", doc="", required=false)
String onlyDoRead = null;
//@Argument(fullName = "keepCompletelyClipped", shortName = "KCC", doc = "Unfortunately, sometimes a read is completely clipped away but with SOFTCLIP_BASES this results in an invalid CIGAR string. ", required = false)
//boolean keepCompletelyClippedReads = false;
// @Argument(fullName = "onlyClipFirstSeqMatch", shortName = "ESC", doc="Only clip the first occurrence of a clipping sequence, rather than all subsequences within a read that match", required = false)
// boolean onlyClipFirstSeqMatch = false;
/**
* The different values for this argument determines how ClipReads applies clips to the reads. This can range
* from writing Ns over the clipped bases to hard clipping away the bases from the BAM.
*/
@Argument(fullName = "clipRepresentation", shortName = "CR", doc = "How should we actually clip the bases?", required = false)
ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS;
@Hidden
@Advanced
@Argument(fullName="read", doc="", required=false)
String onlyDoRead = null;
/**
* List of sequence that should be clipped from the reads

View File

@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.walkers;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -17,7 +17,7 @@ import java.util.Set;
* To change this template use File | Settings | File Templates.
*/
@Requires({DataSource.READS,DataSource.REFERENCE})
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentReadFilter.class})
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class})
public abstract class DuplicateWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?
public boolean filter(GenomeLoc loc, AlignmentContext context, Set<List<SAMRecord>> readSets ) {

View File

@ -3,8 +3,8 @@ package org.broadinstitute.sting.gatk.walkers;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckReadFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentReadFilter;
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -18,7 +18,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@By(DataSource.READS)
@Requires({DataSource.READS,DataSource.REFERENCE, DataSource.REFERENCE_BASES})
@PartitionBy(PartitionType.INTERVAL)
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentReadFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckReadFilter.class})
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class})
public abstract class LocusWalker<MapType, ReduceType> extends Walker<MapType, ReduceType> {
// Do we actually want to operate on the context?
public boolean filter(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {

View File

@ -26,7 +26,6 @@
package org.broadinstitute.sting.gatk.walkers;
import org.broad.tribble.Feature;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
@ -34,9 +33,6 @@ import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;

View File

@ -40,26 +40,65 @@ import java.util.TreeSet;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
/**
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear
* in the input file. It can dynamically merge the contents of multiple input BAM files, resulting
* in merged output sorted in coordinate order. Can also optionally filter reads based on the --read-filter
* command line argument.
* Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file.
*
* <p>
* PrintReads can dynamically merge the contents of multiple input BAM files, resulting
* in merged output sorted in coordinate order. Can also optionally filter reads based on the
* --read_filter command line argument.
*
* <h2>Input</h2>
* <p>
* One or more bam files.
* </p>
*
* <h2>Output</h2>
* <p>
* A single processed bam file.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T PrintReads \
* -o output.bam \
* -I input1.bam \
* -I input2.bam \
* --read_filter MappingQualityZero
* </pre>
*
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@Requires({DataSource.READS, DataSource.REFERENCE})
public class PrintReadsWalker extends ReadWalker<SAMRecord, SAMFileWriter> {
/** an optional argument to dump the reads out to a BAM file */
@Output(doc="Write output to this BAM filename instead of STDOUT")
SAMFileWriter out;
@Argument(fullName = "readGroup", shortName = "readGroup", doc="Exclude all reads with this read group from the output", required = false)
String readGroup = null;
/**
* For example, --platform ILLUMINA or --platform 454.
*/
@Argument(fullName = "platform", shortName = "platform", doc="Exclude all reads with this platform from the output", required = false)
String platform = null; // E.g. ILLUMINA, 454
String platform = null;
@Argument(fullName = "number", shortName = "n", doc="Print the first n reads from the file, discarding the rest", required = false)
int nReadsToPrint = -1;
/**
* Only reads from samples listed in the provided file(s) will be included in the output.
*/
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
public Set<File> sampleFile = new TreeSet<File>();
/**
* Only reads from the sample(s) will be included in the output.
*/
@Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
public Set<String> sampleNames = new TreeSet<String>();

View File

@ -90,7 +90,7 @@ public class AlleleBalance extends InfoFieldAnnotation {
}
// todo -- actually care about indel length from the pileup (agnostic at the moment)
int refCount = indelPileup.size();
int altCount = vc.isInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions();
int altCount = vc.isSimpleInsertion() ? indelPileup.getNumberOfInsertions() : indelPileup.getNumberOfDeletions();
if ( refCount + altCount == 0 ) {
continue;

View File

@ -79,7 +79,7 @@ public class HomopolymerRun extends InfoFieldAnnotation implements StandardAnnot
GenomeLoc locus = ref.getLocus();
GenomeLoc window = ref.getWindow();
int refBasePos = (int) (locus.getStart() - window.getStart())+1;
if ( vc.isDeletion() ) {
if ( vc.isSimpleDeletion() ) {
// check that deleted bases are the same
byte dBase = bases[refBasePos];
for ( int i = 0; i < vc.getReference().length(); i ++ ) {

View File

@ -36,9 +36,9 @@ public class IndelType extends InfoFieldAnnotation implements ExperimentalAnnota
if (!vc.isBiallelic())
type = "MULTIALLELIC_INDEL";
else {
if (vc.isInsertion())
if (vc.isSimpleInsertion())
type = "INS.";
else if (vc.isDeletion())
else if (vc.isSimpleDeletion())
type = "DEL.";
else
type = "OTHER.";

View File

@ -161,19 +161,19 @@ public class SnpEff extends InfoFieldAnnotation implements ExperimentalAnnotatio
public List<VCFInfoHeaderLine> getDescriptions() {
return Arrays.asList(
new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID"),
new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name"),
new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID"),
new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID"),
new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank"),
new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If present, gene is non-coding"),
new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "One of the most high-impact effects across all transcripts at this site"),
new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the effect " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the effect"),
new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid"),
new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon"),
new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number"),
new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size")
new VCFInfoHeaderLine(GENE_ID_KEY, 1, VCFHeaderLineType.String, "Gene ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(GENE_NAME_KEY, 1, VCFHeaderLineType.String, "Gene name for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(TRANSCRIPT_ID_KEY, 1, VCFHeaderLineType.String, "Transcript ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(EXON_ID_KEY, 1, VCFHeaderLineType.String, "Exon ID for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(EXON_RANK_KEY, 1, VCFHeaderLineType.Integer, "Exon rank for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(WITHIN_NON_CODING_GENE_KEY, 0, VCFHeaderLineType.Flag, "If this flag is present, the highest-impact effect resulting from the current variant is within a non-coding gene"),
new VCFInfoHeaderLine(EFFECT_KEY, 1, VCFHeaderLineType.String, "The highest-impact effect resulting from the current variant (or one of the highest-impact effects, if there is a tie)"),
new VCFInfoHeaderLine(EFFECT_IMPACT_KEY, 1, VCFHeaderLineType.String, "Impact of the highest-impact effect resulting from the current variant " + Arrays.toString(SnpEffConstants.EffectImpact.values())),
new VCFInfoHeaderLine(EFFECT_EXTRA_INFORMATION_KEY, 1, VCFHeaderLineType.String, "Additional information about the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(OLD_NEW_AA_KEY, 1, VCFHeaderLineType.String, "Old/New amino acid for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(OLD_NEW_CODON_KEY, 1, VCFHeaderLineType.String, "Old/New codon for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(CODON_NUM_KEY, 1, VCFHeaderLineType.Integer, "Codon number for the highest-impact effect resulting from the current variant"),
new VCFInfoHeaderLine(CDS_SIZE_KEY, 1, VCFHeaderLineType.Integer, "CDS size for the highest-impact effect resulting from the current variant")
);
}
}

View File

@ -49,7 +49,34 @@ import java.util.*;
/**
* Annotates variant calls with context information. Users can specify which of the available annotations to use.
* Annotates variant calls with context information.
*
* <p>
* VariantAnnotator is a GATK tool for annotating variant calls based on their context.
* The tool is modular; new annotations can be written easily without modifying VariantAnnotator itself.
*
* <h2>Input</h2>
* <p>
* A variant set to annotate and optionally one or more BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* An annotated VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantAnnotator \
* -I input.bam \
* -o output.vcf \
 * -A DepthOfCoverage \
* --variant input.vcf \
* --dbsnp dbsnp.vcf
* </pre>
*
*/
@Requires(value={})
@Allows(value={DataSource.READS, DataSource.REFERENCE})
@ -69,8 +96,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
public RodBinding<SnpEffFeature> getSnpEffRodBinding() { return snpEffFile; }
/**
* A dbSNP VCF file from which to annotate.
*
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
*/
@ArgumentCollection
@ -101,15 +126,25 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
@Argument(fullName="sampleName", shortName="sample", doc="The sample (NA-ID) corresponding to the variant input (for non-VCF input only)", required=false)
protected String sampleName = null;
/**
* See the -list argument to view available annotations.
*/
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
protected List<String> annotationsToUse = new ArrayList<String>();
/**
* See the -list argument to view available groups.
*/
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected List<String> annotationGroupsToUse = new ArrayList<String>();
/**
* This option enables you to add annotations from one VCF to another.
*
* For example, if you want to annotate your 'variant' VCF with the AC field value from the rod bound to 'resource',
* you can specify '-E resource.AC' and records in the output VCF will be annotated with 'resource.AC=N' when a record exists in that rod at the given position.
* If multiple records in the rod overlap the given position, one is chosen arbitrarily.
*/
@Argument(fullName="expression", shortName="E", doc="One or more specific expressions to apply to variant calls; see documentation for more details", required=false)
protected List<String> expressionsToUse = new ArrayList<String>();
@ -127,8 +162,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
@Argument(fullName="vcfContainsOnlyIndels", shortName="dels",doc="Use if you are annotating an indel vcf, currently VERY experimental", required = false)
protected boolean indelsOnly = false;
private HashMap<String, String> nonVCFsampleName = new HashMap<String, String>();
private VariantAnnotatorEngine engine;
private Collection<VariantContext> indelBufferContext;
@ -164,12 +197,6 @@ public class VariantAnnotator extends RodWalker<Integer, Integer> implements Ann
List<String> rodName = Arrays.asList(variantCollection.variants.getName());
Set<String> samples = SampleUtils.getUniqueSamplesFromRods(getToolkit(), rodName);
// add the non-VCF sample from the command-line, if applicable
if ( sampleName != null ) {
nonVCFsampleName.put(sampleName.toUpperCase(), "variant");
samples.add(sampleName.toUpperCase());
}
// if there are no valid samples, warn the user
if ( samples.size() == 0 ) {
logger.warn("There are no samples input at all; use the --sampleName argument to specify one if desired.");

View File

@ -29,15 +29,11 @@ import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotationInterfaceManager;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
@ -158,7 +154,7 @@ public class VariantAnnotatorEngine {
private void annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map<String, Object> infoAnnotations) {
for ( Map.Entry<RodBinding<VariantContext>, String> dbSet : dbAnnotations.entrySet() ) {
if ( dbSet.getValue().equals(VCFConstants.DBSNP_KEY) ) {
String rsID = DbSNPHelper.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
String rsID = VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbSet.getKey(), ref.getLocus()), vc.getType());
infoAnnotations.put(VCFConstants.DBSNP_KEY, rsID != null);
// annotate dbsnp id if available and not already there
if ( rsID != null && (!vc.hasID() || vc.getID().equals(VCFConstants.EMPTY_ID_FIELD)) )

View File

@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature;
import org.broadinstitute.sting.utils.codecs.beagle.BeagleFeature;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.SampleUtils;
@ -48,6 +48,31 @@ import static java.lang.Math.log10;
/**
* Takes files produced by Beagle imputation engine and creates a vcf with modified annotations.
*
* <p>This walker is intended to be run after Beagle has successfully executed. The full calling sequence for using Beagle along with the GATK is: </p>
*
* <p>1. Run ProduceBeagleInputWalker. </p>
* <p>2. Run Beagle</p>
* <p>3. Uncompress output files</p>
* <p>4. Run BeagleOutputToVCFWalker.</p>
*
*
* Note that this walker requires all input files produced by Beagle.
*
*
* <h2>Example</h2>
* <pre>
* java -Xmx4000m -jar dist/GenomeAnalysisTK.jar \
* -R reffile.fasta -T BeagleOutputToVCF \
* -V input_vcf.vcf \
* -beagleR2:BEAGLE /myrun.beagle_output.r2 \
* -beaglePhased:BEAGLE /myrun.beagle_output.phased \
* -beagleProbs:BEAGLE /myrun.beagle_output.gprobs \
* -o output_vcf.vcf
* </pre>
<p> Note that Beagle produces some of these files compressed as .gz, so gunzip must be run on them before walker is run in order to decompress them </p>
*/
public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
@ -57,22 +82,18 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
@Input(fullName="comp", shortName = "comp", doc="Comparison VCF file", required=false)
public RodBinding<VariantContext> comp;
@Input(fullName="beagleR2", shortName = "beagleR2", doc="VCF file", required=true)
@Input(fullName="beagleR2", shortName = "beagleR2", doc="Beagle-produced .r2 file containing R^2 values for all markers", required=true)
public RodBinding<BeagleFeature> beagleR2;
@Input(fullName="beagleProbs", shortName = "beagleProbs", doc="VCF file", required=true)
@Input(fullName="beagleProbs", shortName = "beagleProbs", doc="Beagle-produced .probs file containing posterior genotype probabilities", required=true)
public RodBinding<BeagleFeature> beagleProbs;
@Input(fullName="beaglePhased", shortName = "beaglePhased", doc="VCF file", required=true)
@Input(fullName="beaglePhased", shortName = "beaglePhased", doc="Beagle-produced .phased file containing phased genotypes", required=true)
public RodBinding<BeagleFeature> beaglePhased;
@Output(doc="File to which variants should be written",required=true)
@Output(doc="VCF File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
@Argument(fullName="output_file", shortName="output", doc="Please use --out instead" ,required=false)
@Deprecated
protected String oldOutputArg;
@Argument(fullName="dont_mark_monomorphic_sites_as_filtered", shortName="keep_monomorphic", doc="If provided, we won't filter sites that beagle tags as monomorphic. Useful for imputing a sample's genotypes from a reference panel" ,required=false)
public boolean DONT_FILTER_MONOMORPHIC_SITES = false;

View File

@ -48,19 +48,45 @@ import java.io.PrintStream;
import java.util.*;
/**
* Produces an input file to Beagle imputation engine, listing genotype likelihoods for each sample in input variant file
* Converts the input VCF into a format accepted by the Beagle imputation/analysis program.
* <p>
*
* <h2>Input</h2>
* <p>
* A VCF with variants to convert to Beagle format
* </p>
*
* <h2>Outputs</h2>
* <p>
* A single text file which can be fed to Beagle
* </p>
* <p>
* Optional: A file with a list of markers
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar dist/GenomeAnalysisTK.jar -L 20 \
* -R reffile.fasta -T ProduceBeagleInput \
* -V path_to_input_vcf/inputvcf.vcf -o path_to_beagle_output/beagle_output
* </pre>
*
*/
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@Input(fullName="validation", shortName = "validation", doc="Input VCF file", required=false)
@Hidden
@Input(fullName="validation", shortName = "validation", doc="Validation VCF file", required=false)
public RodBinding<VariantContext> validation;
@Output(doc="File to which BEAGLE input should be written",required=true)
protected PrintStream beagleWriter = null;
@Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false)
@Hidden
@Output(doc="File to which BEAGLE markers should be written", shortName="markers", fullName = "markers", required = false)
protected PrintStream markers = null;
int markerCounter = 1;
@ -73,14 +99,19 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
@Argument(doc="VQSqual key", shortName = "vqskey", required=false)
protected String VQSLOD_KEY = "VQSqual";
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
@Hidden
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
public double insertedNoCallRate = 0;
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
@Hidden
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
public double validationPrior = -1.0;
@Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
@Hidden
@Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
public double bootstrap = 0.0;
@Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
@Hidden
@Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
VCFWriter bootstrapVCFOutput = null;
@Argument(fullName = "checkIsMaleOnChrX", shortName = "checkIsMaleOnChrX", doc = "Set to true when Beagle-ing chrX and want to ensure male samples don't have heterozygous calls.", required = false)
public boolean CHECK_IS_MALE_ON_CHR_X = false;

View File

@ -22,6 +22,7 @@
package org.broadinstitute.sting.gatk.walkers.coverage;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -42,50 +43,195 @@ import java.io.PrintStream;
/**
* Emits a data file containing information about callable, uncallable, poorly mapped, and other parts of the genome
*
* @Author depristo
* @Date May 7, 2010
* <p>
* A very common question about a NGS set of reads is what areas of the genome are considered callable. The system
* considers the coverage at each locus and emits either a per base state or a summary interval BED file that
* partitions the genomic intervals into the following callable states:
* <dl>
* <dt>REF_N</dt>
* <dd>the reference base was an N, which is not considered callable the GATK</dd>
* <dt>CALLABLE</dt>
* <dd>the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE</dd>
* <dt>NO_COVERAGE</dt>
* <dd>absolutely no reads were seen at this locus, regardless of the filtering parameters</dd>
* <dt>LOW_COVERAGE</dt>
* <dd>there were less than min. depth bases at the locus, after applying filters</dd>
* <dt>EXCESSIVE_COVERAGE</dt>
* <dd>more than -maxDepth read at the locus, indicating some sort of mapping problem</dd>
* <dt>POOR_MAPPING_QUALITY</dt>
* <dd>more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads</dd>
* </dl>
* </p>
*
* <h2>Input</h2>
* <p>
* A BAM file containing <b>exactly one sample</b>.
* </p>
*
* <h2>Output</h2>
* <p>
* <ul>
* <li>-o: a OutputFormatted (recommended BED) file with the callable status covering each base</li>
* <li>-summary: a table of callable status x count of all examined bases</li>
* </ul>
* </p>
*
* <h2>Examples</h2>
* <pre>
* -T CallableLociWalker \
* -I my.bam \
* -summary my.summary \
* -o my.bed
* </pre>
*
* would produce a BED file (my.bed) that looks like:
*
* <pre>
* 20 10000000 10000864 CALLABLE
* 20 10000865 10000985 POOR_MAPPING_QUALITY
* 20 10000986 10001138 CALLABLE
* 20 10001139 10001254 POOR_MAPPING_QUALITY
* 20 10001255 10012255 CALLABLE
* 20 10012256 10012259 POOR_MAPPING_QUALITY
* 20 10012260 10012263 CALLABLE
* 20 10012264 10012328 POOR_MAPPING_QUALITY
* 20 10012329 10012550 CALLABLE
* 20 10012551 10012551 LOW_COVERAGE
* 20 10012552 10012554 CALLABLE
* 20 10012555 10012557 LOW_COVERAGE
* 20 10012558 10012558 CALLABLE
* et cetera...
* </pre>
* as well as a summary table that looks like:
*
* <pre>
* state nBases
* REF_N 0
* CALLABLE 996046
* NO_COVERAGE 121
* LOW_COVERAGE 928
* EXCESSIVE_COVERAGE 0
* POOR_MAPPING_QUALITY 2906
* </pre>
*
* @author Mark DePristo
* @since May 7, 2010
*/
@By(DataSource.REFERENCE)
public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableBaseState, CallableLociWalker.Integrator> {
@Output
PrintStream out;
@Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read. The gap between this value and mmq are reads that are not sufficiently well mapped for calling but aren't indicative of mapping problems.", required = false)
/**
* Callable loci summary counts (see outputs) will be written to this file.
*/
@Output(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true)
File summaryFile;
/**
* The gap between this value and mmq are reads that are not sufficiently well mapped for calling but
* aren't indicative of mapping problems. For example, if maxLowMAPQ = 1 and mmq = 20, then reads with
* MAPQ == 0 are poorly mapped, MAPQ >= 20 are considered as contributing to calling, where
* reads with MAPQ >= 1 and < 20 are not bad in and of themselves but aren't sufficiently good to contribute to
* calling. In effect this reads are invisible, driving the base to the NO_ or LOW_COVERAGE states
*/
@Argument(fullName = "maxLowMAPQ", shortName = "mlmq", doc = "Maximum value for MAPQ to be considered a problematic mapped read.", required = false)
byte maxLowMAPQ = 1;
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth. Defaults to 50.", required = false)
/**
* Reads with MAPQ > minMappingQuality are treated as usable for variation detection, contributing to the CALLABLE
* state.
*/
@Argument(fullName = "minMappingQuality", shortName = "mmq", doc = "Minimum mapping quality of reads to count towards depth.", required = false)
byte minMappingQuality = 10;
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth. Defaults to 20.", required = false)
/**
* Bases with less than minBaseQuality are viewed as not sufficiently high quality to contribute to the CALLABLE state
*/
@Argument(fullName = "minBaseQuality", shortName = "mbq", doc = "Minimum quality of bases to count towards depth.", required = false)
byte minBaseQuality = 20;
/**
* If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
* value and is less than maxDepth the site is considered CALLABLE.
*/
@Advanced
@Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
int minDepth = 4;
/**
* If the QC+ depth exceeds this value the site is considered to have EXCESSIVE_DEPTH
*/
@Argument(fullName = "maxDepth", shortName = "maxDepth", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
int maxDepth = -1;
/**
* We don't want to consider a site as POOR_MAPPING_QUALITY just because it has two reads, and one is MAPQ. We
* won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads
* covering the site.
*/
@Advanced
@Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false)
int minDepthLowMAPQ = 10;
@Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "Maximum read depth before a locus is considered poorly mapped", required = false)
/**
* If the number of reads at this site is greater than minDepthForLowMAPQ and the fraction of reads with low mapping quality
* exceeds this fraction then the site has POOR_MAPPING_QUALITY.
*/
@Argument(fullName = "maxFractionOfReadsWithLowMAPQ", shortName = "frlmq", doc = "If the fraction of reads at a base with low mapping quality exceeds this value, the site may be poorly mapped", required = false)
double maxLowMAPQFraction = 0.1;
@Argument(fullName = "format", shortName = "format", doc = "Output format for the system: either BED or STATE_PER_BASE", required = false)
/**
* The output of this walker will be written in this format. The recommended option is BED.
*/
@Advanced
@Argument(fullName = "format", shortName = "format", doc = "Output format", required = false)
OutputFormat outputFormat;
@Argument(fullName = "summary", shortName = "summary", doc = "Name of file for output summary", required = true)
File summaryFile;
public enum OutputFormat {
/**
* The output will be written as a BED file. There's a BED element for each
* continuous run of callable states (i.e., CALLABLE, REF_N, etc). This is the recommended
* format
*/
BED,
public enum OutputFormat { BED, STATE_PER_BASE }
/**
* Emit chr start stop state quads for each base. Produces a potentially disasterously
* large amount of output.
*/
STATE_PER_BASE
}
public enum CalledState {
/** the reference base was an N, which is not considered callable the GATK */
REF_N,
/** the base satisfied the min. depth for calling but had less than maxDepth to avoid having EXCESSIVE_COVERAGE */
CALLABLE,
/** absolutely no reads were seen at this locus, regardless of the filtering parameters */
NO_COVERAGE,
/** there were less than min. depth bases at the locus, after applying filters */
LOW_COVERAGE,
/** more than -maxDepth read at the locus, indicating some sort of mapping problem */
EXCESSIVE_COVERAGE,
/** more than --maxFractionOfReadsWithLowMAPQ at the locus, indicating a poor mapping quality of the reads */
POOR_MAPPING_QUALITY
}
////////////////////////////////////////////////////////////////////////////////////
// STANDARD WALKER METHODS
////////////////////////////////////////////////////////////////////////////////////
@Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
@Override
public void initialize() {
if ( getToolkit().getSamples().size() != 2 ) {
// unbelievably there are actually two samples even when there's just one in the header. God I hate this Samples system
throw new UserException.BadArgumentValue("-I", "CallableLoci only works for a single sample, but multiple samples were found in the provided BAM files: " + getToolkit().getSamples());
}
try {
PrintStream summaryOut = new PrintStream(summaryFile);
summaryOut.close();
@ -94,15 +240,15 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
}
}
public static class Integrator {
long counts[] = new long[CalledState.values().length];
protected static class Integrator {
final long counts[] = new long[CalledState.values().length];
CallableBaseState state = null;
}
public static class CallableBaseState implements HasGenomeLocation {
public GenomeLocParser genomeLocParser;
protected static class CallableBaseState implements HasGenomeLocation {
final public GenomeLocParser genomeLocParser;
public GenomeLoc loc;
public CalledState state;
final public CalledState state;
public CallableBaseState(GenomeLocParser genomeLocParser,GenomeLoc loc, CalledState state) {
this.genomeLocParser = genomeLocParser;
@ -133,16 +279,10 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
public String toString() {
return String.format("%s %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), state);
//return String.format("%s %d %d %d %s", loc.getContig(), loc.getStart(), loc.getStop(), loc.getStop() - loc.getStart() + 1, state);
}
}
public enum CalledState { REF_N, CALLABLE, NO_COVERAGE, LOW_COVERAGE, EXCESSIVE_COVERAGE, POOR_MAPPING_QUALITY }
public Integrator reduceInit() {
return new Integrator();
}
@Override
public CallableBaseState map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
CalledState state;
@ -179,6 +319,12 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
return new CallableBaseState(getToolkit().getGenomeLocParser(),context.getLocation(), state);
}
@Override
public Integrator reduceInit() {
return new Integrator();
}
@Override
public Integrator reduce(CallableBaseState state, Integrator integrator) {
// update counts
integrator.counts[state.getState().ordinal()]++;
@ -206,6 +352,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
// INTERVAL ON TRAVERSAL DONE
////////////////////////////////////////////////////////////////////////////////////
@Override
public void onTraversalDone(Integrator result) {
// print out the last state
if ( result != null ) {

View File

@ -32,8 +32,8 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec;
import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
@ -51,14 +51,48 @@ import java.io.PrintStream;
import java.util.*;
/**
* A parallelizable walker designed to quickly aggregate relevant coverage statistics across samples in the input
* file. Assesses the mean and median granular coverages of each sample, and generates part of a cumulative
* distribution of % bases and % targets covered for certain depths. The granularity of DOC can be set by command
* line arguments.
* Toolbox for assessing sequence coverage by a wide array of metrics, partitioned by sample, read group, or library
*
* <p>
* DepthOfCoverage processes a set of bam files to determine coverage at different levels of partitioning and
* aggregation. Coverage can be analyzed per locus, per interval, per gene, or in total; can be partitioned by
* sample, by read group, by technology, by center, or by library; and can be summarized by mean, median, quartiles,
* and/or percentage of bases covered to or beyond a threshold.
* Additionally, reads and bases can be filtered by mapping or base quality score.
*
* <h2>Input</h2>
* <p>
* One or more bam files (with proper headers) to be analyzed for coverage statistics
* (Optional) A REFSEQ Rod to aggregate coverage to the gene level
* </p>
*
* <h2>Output</h2>
* <p>
* Tables pertaining to different coverage summaries. Suffix on the table files declares the contents:
* - no suffix: per locus coverage
* - _summary: total, mean, median, quartiles, and threshold proportions, aggregated over all bases
* - _statistics: coverage histograms (# locus with X coverage), aggregated over all bases
* - _interval_summary: total, mean, median, quartiles, and threshold proportions, aggregated per interval
* - _interval_statistics: 2x2 table of # of intervals covered to >= X depth in >=Y samples
* - _gene_summary: total, mean, median, quartiles, and threshold proportions, aggregated per gene
* - _gene_statistics: 2x2 table of # of genes covered to >= X depth in >= Y samples
* - _cumulative_coverage_counts: coverage histograms (# locus with >= X coverage), aggregated over all bases
* - _cumulative_coverage_proportions: proprotions of loci with >= X coverage, aggregated over all bases
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantEval \
* -o file_name_base \
* -I input_bams.list
* [-geneList refSeq.sorted.txt] \
* [-pt readgroup] \
* [-ct 4 -ct 6 -ct 10] \
* [-L my_capture_genes.interval_list]
* </pre>
*
* @Author chartl
* @Date Feb 22, 2010
*/
// todo -- cache the map from sample names to means in the print functions, rather than regenerating each time
// todo -- support for granular histograms for total depth; maybe n*[start,stop], bins*sqrt(n)

View File

@ -38,12 +38,32 @@ import java.util.List;
/**
* Walks along reference and calculates the GC content for each interval.
*
*
* <h2>Input</h2>
* <p>
* One or more BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* GC content calculations per interval.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T GCContentByInterval \
* -o output.txt \
* -I input.bam \
* -L input.intervals
* </pre>
*
*/
@Allows(value = {DataSource.REFERENCE})
@Requires(value = {DataSource.REFERENCE})
@By(DataSource.REFERENCE)
public class GCContentByIntervalWalker extends LocusWalker<Long, Long> {
@Output
protected PrintStream out;

View File

@ -35,22 +35,53 @@ import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.Collections;
import java.util.List;
/**
* Generates an alternative reference sequence over the specified interval. Given variant ROD tracks,
* it replaces the reference bases at variation sites with the bases supplied by the ROD(s). Additionally,
* allows for a "snpmask" ROD to set overlapping bases to 'N'.
* Generates an alternative reference sequence over the specified interval.
*
* <p>
* Given variant ROD tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s).
* Additionally, allows for a "snpmask" ROD to set overlapping bases to 'N'.
*
* <h2>Input</h2>
* <p>
* The reference, requested intervals, and any number of variant rod files.
* </p>
*
* <h2>Output</h2>
* <p>
* A fasta file representing the requested intervals.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T FastaAlternateReferenceMaker \
* -o output.fasta \
* -L input.intervals \
* --variant input.vcf \
* [--snpmask mask.vcf]
* </pre>
*
*/
@WalkerName("FastaAlternateReferenceMaker")
@Reference(window=@Window(start=-1,stop=50))
@Requires(value={DataSource.REFERENCE})
public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
/**
* Variants from these input files are used by this tool to construct an alternate reference.
*/
@Input(fullName = "variant", shortName = "V", doc="variants to model", required=false)
public List<RodBinding<VariantContext>> variants;
public List<RodBinding<VariantContext>> variants = Collections.emptyList();
/**
* Snps from this file are used as a mask when constructing the alternate reference.
*/
@Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false)
public RodBinding<VariantContext> snpmask;
@ -66,17 +97,18 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker {
String refBase = String.valueOf((char)ref.getBase());
// Check to see if we have a called snp
for ( VariantContext vc : tracker.getValues(VariantContext.class) ) {
if ( ! vc.getSource().equals(snpmask.getName())) {
if ( vc.isDeletion()) {
deletionBasesRemaining = vc.getReference().length();
// delete the next n bases, not this one
return new Pair<GenomeLoc, String>(context.getLocation(), refBase);
} else if ( vc.isInsertion()) {
return new Pair<GenomeLoc, String>(context.getLocation(), refBase.concat(vc.getAlternateAllele(0).toString()));
} else if (vc.isSNP()) {
return new Pair<GenomeLoc, String>(context.getLocation(), vc.getAlternateAllele(0).toString());
}
for ( VariantContext vc : tracker.getValues(variants) ) {
if ( vc.isFiltered() )
continue;
if ( vc.isSimpleDeletion()) {
deletionBasesRemaining = vc.getReference().length();
// delete the next n bases, not this one
return new Pair<GenomeLoc, String>(context.getLocation(), refBase);
} else if ( vc.isSimpleInsertion()) {
return new Pair<GenomeLoc, String>(context.getLocation(), refBase.concat(vc.getAlternateAllele(0).toString()));
} else if (vc.isSNP()) {
return new Pair<GenomeLoc, String>(context.getLocation(), vc.getAlternateAllele(0).toString());
}
}

View File

@ -38,14 +38,44 @@ import org.broadinstitute.sting.utils.collections.Pair;
import java.io.PrintStream;
/**
* Renders a new reference in FASTA format consisting of only those loci provided in the input data set. Has optional
* features to control the output format.
* Renders a new reference in FASTA format consisting of only those loci provided in the input data set.
*
* <p>
* The output format can be partially controlled using the provided command-line arguments.
*
* <h2>Input</h2>
* <p>
* The reference and requested intervals.
* </p>
*
* <h2>Output</h2>
* <p>
* A fasta file representing the requested intervals.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T FastaReference \
* -o output.fasta \
* -L input.intervals
* </pre>
*
*/
@WalkerName("FastaReferenceMaker")
public class FastaReferenceWalker extends RefWalker<Pair<GenomeLoc, String>, GenomeLoc> {
@Output PrintStream out;
@Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false) public int fastaLineWidth=60;
@Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity) - CAUTION: adjacent intervals will automatically be merged", required=false) public boolean fastaRawSeqs=false;
@Argument(fullName="lineWidth", shortName="lw", doc="Maximum length of sequence to write per line", required=false)
public int fastaLineWidth=60;
/**
* Please note that when using this argument adjacent intervals will automatically be merged.
*/
@Argument(fullName="rawOnelineSeq", shortName="raw", doc="Print sequences with no FASTA header lines, one line per interval (i.e. lineWidth = infinity)", required=false)
public boolean fastaRawSeqs=false;
protected FastaSequence fasta;

View File

@ -45,6 +45,34 @@ import java.util.*;
/**
* Filters variant calls using a number of user-selectable, parameterizable criteria.
*
* <p>
* VariantFiltration is a GATK tool for hard-filtering variant calls based on certain criteria.
* Records are hard-filtered by changing the value in the FILTER field to something other than PASS.
*
* <h2>Input</h2>
* <p>
* A variant set to filter.
* </p>
*
* <h2>Output</h2>
* <p>
* A filtered VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantFiltration \
* -o output.vcf \
* --variant input.vcf \
* --filterExpression "AB < 0.2 || MQ0 > 50" \
* --filterName "Nov09filters" \
* --mask mask.vcf \
* --maskName InDel
* </pre>
*
*/
@Reference(window=@Window(start=-50,stop=50))
public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
@ -52,33 +80,65 @@ public class VariantFiltrationWalker extends RodWalker<Integer, Integer> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
* Any variant which overlaps entries from the provided mask rod will be filtered.
*/
@Input(fullName="mask", doc="Input ROD mask", required=false)
public RodBinding<Feature> mask;
@Output(doc="File to which variants should be written", required=true)
protected VCFWriter writer = null;
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter (see wiki docs for more info)", required=false)
/**
* VariantFiltration accepts any number of JEXL expressions (so you can have two named filters by using
* --filterName One --filterExpression "X < 1" --filterName Two --filterExpression "X > 2").
*/
@Argument(fullName="filterExpression", shortName="filter", doc="One or more expression used with INFO fields to filter", required=false)
protected ArrayList<String> FILTER_EXPS = new ArrayList<String>();
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
/**
* This name is put in the FILTER field for variants that get filtered. Note that there must be a 1-to-1 mapping between filter expressions and filter names.
*/
@Argument(fullName="filterName", shortName="filterName", doc="Names to use for the list of filters", required=false)
protected ArrayList<String> FILTER_NAMES = new ArrayList<String>();
/**
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
* VariantFiltration will add the sample-level FT tag to the FORMAT field of filtered samples (this does not affect the record's FILTER tag).
* One can filter normally based on most fields (e.g. "GQ < 5.0"), but the GT (genotype) field is an exception. We have put in convenience
* methods so that one can now filter out hets ("isHet == 1"), refs ("isHomRef == 1"), or homs ("isHomVar == 1").
*/
@Argument(fullName="genotypeFilterExpression", shortName="G_filter", doc="One or more expression used with FORMAT (sample/genotype-level) fields to filter (see wiki docs for more info)", required=false)
protected ArrayList<String> GENOTYPE_FILTER_EXPS = new ArrayList<String>();
/**
* Similar to the INFO field based expressions, but used on the FORMAT (genotype) fields instead.
*/
@Argument(fullName="genotypeFilterName", shortName="G_filterName", doc="Names to use for the list of sample/genotype filters (must be a 1-to-1 mapping); this name is put in the FILTER field for variants that get filtered", required=false)
protected ArrayList<String> GENOTYPE_FILTER_NAMES = new ArrayList<String>();
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster (see also --clusterWindowSize); [default:3]", required=false)
/**
* Works together with the --clusterWindowSize argument.
*/
@Argument(fullName="clusterSize", shortName="cluster", doc="The number of SNPs which make up a cluster", required=false)
protected Integer clusterSize = 3;
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs (to disable the clustered SNP filter, set this value to less than 1); [default:0]", required=false)
/**
* Works together with the --clusterSize argument. To disable the clustered SNP filter, set this value to less than 1.
*/
@Argument(fullName="clusterWindowSize", shortName="window", doc="The window size (in bases) in which to evaluate clustered SNPs", required=false)
protected Integer clusterWindow = 0;
@Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered; [default:0]", required=false)
@Argument(fullName="maskExtension", shortName="maskExtend", doc="How many bases beyond records from a provided 'mask' rod should variants be filtered", required=false)
protected Integer MASK_EXTEND = 0;
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call; [default:'Mask']", required=false)
@Argument(fullName="maskName", shortName="maskName", doc="The text to put in the FILTER field if a 'mask' rod is provided and overlaps with a variant call", required=false)
protected String MASK_NAME = "Mask";
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, should missing values be considered failing the expression (by default they are considered passing)?", required=false)
/**
* By default, if JEXL cannot evaluate your expression for a particular record because one of the annotations is not present, the whole expression evaluates as PASSing.
* Use this argument to have it evaluate as failing filters instead for these cases.
*/
@Argument(fullName="missingValuesInExpressionsShouldEvaluateAsFailing", doc="When evaluating the JEXL expressions, missing values should be considered failing the expression", required=false)
protected Boolean FAIL_MISSING_VALUES = false;
// JEXL expressions for the filters

View File

@ -44,7 +44,9 @@ import java.util.Set;
public abstract class AlleleFrequencyCalculationModel implements Cloneable {
public enum Model {
/** The default model with the best performance in all cases */
EXACT,
/** For posterity we have kept around the older GRID_SEARCH model, but this gives inferior results and shouldn't be used. */
GRID_SEARCH
}

View File

@ -53,7 +53,9 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable {
}
public enum GENOTYPING_MODE {
/** the default; the Unified Genotyper will choose the most likely alternate allele */
DISCOVERY,
/** only the alleles passed in from a VCF rod bound to the -alleles argument will be used for genotyping */
GENOTYPE_GIVEN_ALLELES
}

View File

@ -36,31 +36,54 @@ import java.io.File;
public class UnifiedArgumentCollection {
// control the various models to be used
@Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false)
public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
/**
* Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
*/
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ -- EXACT is the default option, while GRID_SEARCH is also available.", required = false)
public AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT;
/**
* The expected heterozygosity value used to compute prior likelihoods for any locus. The default priors are:
* het = 1e-3, P(hom-ref genotype) = 1 - 3 * het / 2, P(het genotype) = het, P(hom-var genotype) = het / 2
*/
@Argument(fullName = "heterozygosity", shortName = "hets", doc = "Heterozygosity value used to compute prior likelihoods for any locus", required = false)
public Double heterozygosity = DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY;
@Argument(fullName = "pcr_error_rate", shortName = "pcr_error", doc = "The PCR error rate to be used for computing fragment-based likelihoods", required = false)
public Double PCR_error = DiploidSNPGenotypeLikelihoods.DEFAULT_PCR_ERROR_RATE;
/**
* Specifies how to determine the alternate allele to use for genotyping
*/
@Argument(fullName = "genotyping_mode", shortName = "gt_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
public GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY;
@Argument(fullName = "output_mode", shortName = "out_mode", doc = "Should we output confident genotypes (i.e. including ref calls) or just the variants?", required = false)
public UnifiedGenotyperEngine.OUTPUT_MODE OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY;
/**
* The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. Only genotypes with
* confidence >= this threshold are emitted as called sites. A reasonable threshold is 30 for high-pass calling (this
* is the default). Note that the confidence (QUAL) values for multi-sample low-pass (e.g. 4x per sample) calling might
* be significantly smaller with the new EXACT model than with our older GRID_SEARCH model, as the latter tended to
* over-estimate the confidence; for low-pass calling we tend to use much smaller thresholds (e.g. 4).
*/
@Argument(fullName = "standard_min_confidence_threshold_for_calling", shortName = "stand_call_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be called", required = false)
public double STANDARD_CONFIDENCE_FOR_CALLING = 30.0;
/**
* the minimum phred-scaled Qscore threshold to emit low confidence calls. Genotypes with confidence >= this but less
* than the calling threshold are emitted but marked as filtered.
*/
@Argument(fullName = "standard_min_confidence_threshold_for_emitting", shortName = "stand_emit_conf", doc = "The minimum phred-scaled confidence threshold at which variants not at 'trigger' track sites should be emitted (and filtered if less than the calling threshold)", required = false)
public double STANDARD_CONFIDENCE_FOR_EMITTING = 30.0;
/**
* This argument is not enabled by default because it increases the runtime by an appreciable amount.
*/
@Argument(fullName = "computeSLOD", shortName = "sl", doc = "If provided, we will calculate the SLOD", required = false)
public boolean COMPUTE_SLOD = false;
@ -80,7 +103,6 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "abort_at_too_much_coverage", doc = "Don't call a site if the downsampled coverage is greater than this value", required = false)
public int COVERAGE_AT_WHICH_TO_ABORT = -1;
// control the various parameters to be used
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for calling", required = false)
public int MIN_BASE_QUALTY_SCORE = 17;
@ -91,11 +113,17 @@ public class UnifiedArgumentCollection {
@Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false)
public Double MAX_DELETION_FRACTION = 0.05;
// indel-related arguments
/**
* A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site.
* Decreasing this value will increase sensitivity but at the cost of larger calling time and a larger number of false positives.
*/
@Argument(fullName = "min_indel_count_for_genotyping", shortName = "minIndelCnt", doc = "Minimum number of consensus indels required to trigger genotyping run", required = false)
public int MIN_INDEL_COUNT_FOR_GENOTYPING = 5;
/**
* This argument informs the prior probability of having an indel at a site.
*/
@Argument(fullName = "indel_heterozygosity", shortName = "indelHeterozygosity", doc = "Heterozygosity for indel calling", required = false)
public double INDEL_HETEROZYGOSITY = 1.0/8000;
@ -126,22 +154,23 @@ public class UnifiedArgumentCollection {
@Hidden
@Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false)
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
@Hidden
@Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false)
public boolean dovit = false;
@Hidden
@Argument(fullName = "GSA_PRODUCTION_ONLY", shortName = "GSA_PRODUCTION_ONLY", doc = "don't ever use me", required = false)
public boolean GSA_PRODUCTION_ONLY = false;
@Hidden
@Argument(fullName = "exactCalculation", shortName = "exactCalculation", doc = "expt", required = false)
public ExactAFCalculationModel.ExactCalculation EXACT_CALCULATION_TYPE = ExactAFCalculationModel.ExactCalculation.LINEAR_EXPERIMENTAL;
@Hidden
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
@Argument(fullName = "ignoreSNPAlleles", shortName = "ignoreSNPAlleles", doc = "expt", required = false)
public boolean IGNORE_SNP_ALLELES = false;
@Deprecated
@Argument(fullName="output_all_callable_bases", shortName="all_bases", doc="Please use --output_mode EMIT_ALL_SITES instead" ,required=false)
private Boolean ALL_BASES_DEPRECATED = false;

View File

@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
@ -45,13 +45,73 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.PrintStream;
import java.util.*;
/**
* A variant caller which unifies the approaches of several disparate callers. Works for single-sample and
* multi-sample data. The user can choose from several different incorporated calculation models.
* A variant caller which unifies the approaches of several disparate callers -- Works for single-sample and multi-sample data.
*
* <p>
* The GATK Unified Genotyper is a multiple-sample, technology-aware SNP and indel caller. It uses a Bayesian genotype
* likelihood model to estimate simultaneously the most likely genotypes and allele frequency in a population of N samples,
* emitting an accurate posterior probability of there being a segregating variant allele at each locus as well as for the
* genotype of each sample. The system can either emit just the variant sites or complete genotypes (which includes
* homozygous reference calls) satisfying some phred-scaled confidence value. The genotyper can make accurate calls on
* both single sample data and multi-sample data.
*
* <h2>Input</h2>
* <p>
* The read data from which to make variant calls.
* </p>
*
* <h2>Output</h2>
* <p>
* A raw, unfiltered, highly specific callset in VCF format.
* </p>
*
* <h2>Example generic command for multi-sample SNP calling</h2>
* <pre>
* java -jar GenomeAnalysisTK.jar \
* -R resources/Homo_sapiens_assembly18.fasta \
* -T UnifiedGenotyper \
* -I sample1.bam [-I sample2.bam ...] \
* --dbsnp dbSNP.vcf \
* -o snps.raw.vcf \
* -stand_call_conf [50.0] \
* -stand_emit_conf 10.0 \
* -dcov [50] \
* [-L targets.interval_list]
* </pre>
*
* <p>
* The above command will call all of the samples in your provided BAM files [-I arguments] together and produce a VCF file
* with sites and genotypes for all samples. The easiest way to get the dbSNP file is from the GATK resource bundle. Several
* arguments have parameters that should be chosen based on the average coverage per sample in your data. See the detailed
* argument descriptions below.
* </p>
*
* <h2>Example command for generating calls at all sites</h2>
* <pre>
* java -jar /path/to/GenomeAnalysisTK.jar \
* -l INFO \
* -R resources/Homo_sapiens_assembly18.fasta \
* -T UnifiedGenotyper \
* -I /DCC/ftp/pilot_data/data/NA12878/alignment/NA12878.SLX.maq.SRP000031.2009_08.bam \
* -o my.vcf \
* --output_mode EMIT_ALL_SITES
* </pre>
*
* <h2>Caveats</h2>
* <ul>
* <li>The system is under active and continuous development. All outputs, the underlying likelihood model, arguments, and
* file formats are likely to change.</li>
* <li>The system can be very aggressive in calling variants. In the 1000 genomes project for pilot 2 (deep coverage of ~35x)
* we expect the raw Qscore > 50 variants to contain at least ~10% FP calls. We use extensive post-calling filters to eliminate
* most of these FPs. Variant Quality Score Recalibration is a tool to perform this filtering.</li>
* <li>We only handle diploid genotypes</li>
* </ul>
*
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT)
@ReadFilters( {BadMateFilter.class, MappingQualityUnavailableReadFilter.class} )
@ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} )
@Reference(window=@Window(start=-200,stop=200))
@By(DataSource.REFERENCE)
@Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250)
@ -61,10 +121,9 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
/**
* A dbSNP VCF file from which to annotate.
*
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
*/
* rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
* dbSNP is not used in any way for the calculations themselves.
*/
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }
@ -72,7 +131,9 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
public List<RodBinding<VariantContext>> getCompRodBindings() { return Collections.emptyList(); }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
// control the output
/**
* A raw, unfiltered, highly specific callset in VCF format.
*/
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter writer = null;
@ -82,9 +143,15 @@ public class UnifiedGenotyper extends LocusWalker<VariantCallContext, UnifiedGen
@Argument(fullName = "metrics_file", shortName = "metrics", doc = "File to print any relevant callability metrics output", required = false)
protected PrintStream metricsWriter = null;
/**
* Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
*/
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
protected List<String> annotationsToUse = new ArrayList<String>();
/**
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
*/
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected String[] annotationClassesToUse = { "Standard" };

View File

@ -51,8 +51,11 @@ public class UnifiedGenotyperEngine {
public static final String LOW_QUAL_FILTER_NAME = "LowQual";
public enum OUTPUT_MODE {
/** the default */
EMIT_VARIANTS_ONLY,
/** include confident reference sites */
EMIT_ALL_CONFIDENT_SITES,
/** any callable site regardless of confidence */
EMIT_ALL_SITES
}
@ -484,6 +487,9 @@ public class UnifiedGenotyperEngine {
Map<String, AlignmentContext> stratifiedContexts = null;
if ( !BaseUtils.isRegularBase( refContext.getBase() ) )
return null;
if ( model == GenotypeLikelihoodsCalculationModel.Model.INDEL ) {
if (UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES) {
@ -498,6 +504,7 @@ public class UnifiedGenotyperEngine {
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup, UAC.ASSUME_SINGLE_SAMPLE);
} else {
// todo - tmp will get rid of extended events so this wont be needed
if (!rawContext.hasExtendedEventPileup())
return null;
@ -515,9 +522,6 @@ public class UnifiedGenotyperEngine {
}
} else if ( model == GenotypeLikelihoodsCalculationModel.Model.SNP ) {
if ( !BaseUtils.isRegularBase( refContext.getBase() ) )
return null;
// stratify the AlignmentContext and cut by sample
stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(rawContext.getBasePileup(), UAC.ASSUME_SINGLE_SAMPLE);

View File

@ -65,10 +65,53 @@ import java.util.*;
/**
* Performs local realignment of reads based on misalignments due to the presence of indels.
* Unlike most mappers, this walker uses the full alignment context to determine whether an
* appropriate alternate reference (i.e. indel) exists and updates SAMRecords accordingly.
*
* <p>
* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
* is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
* or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
* the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
* it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are
* correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel,
* also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus
* indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an
* appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and
* specifically identify indels.
* <p>
* <ol>There are 2 steps to the realignment process:
* <li>Determining (small) suspicious intervals which are likely in need of realignment (see the RealignerTargetCreator tool)</li>
* <li>Running the realigner over those intervals (IndelRealigner)</li>
* </ol>
* <p>
* An important note: the input bam(s), reference, and known indel file(s) should be the same ones used for the RealignerTargetCreator step.
* <p>
* Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them
* (or with reads from similar technologies).
*
* <h2>Input</h2>
* <p>
* One or more aligned BAM files and optionally one or more lists of known indels.
* </p>
*
* <h2>Output</h2>
* <p>
* A realigned version of your input BAM file(s).
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -I input.bam \
* -R ref.fasta \
* -T IndelRealigner \
* -targetIntervals intervalListFromRTC.intervals \
* -o realignedBam.bam \
* [--known /path/to/indels.vcf] \
* [-compress 0] (this argument recommended to speed up the process *if* this is only a temporary file; otherwise, use the default value)
* </pre>
*
* @author ebanks
*/
//Reference(window=@Window(start=-30,stop=30))
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
public class IndelRealigner extends ReadWalker<Integer, Integer> {
@ -77,88 +120,145 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
public static final String PROGRAM_RECORD_NAME = "GATK IndelRealigner";
public enum ConsensusDeterminationModel {
/**
* Uses only indels from a provided ROD of known indels.
*/
KNOWNS_ONLY,
/**
* Additionally uses indels already present in the original alignments of the reads.
*/
USE_READS,
/**
* Additionally uses 'Smith-Waterman' to generate alternate consenses.
*/
USE_SW
}
@Input(fullName="known", shortName = "known", doc="Input VCF file with known indels", required=false)
/**
* Any number of VCF files representing known indels to be used for constructing alternate consenses.
* Could be e.g. dbSNP and/or official 1000 Genomes indel calls. Non-indel variants in these files will be ignored.
*/
@Input(fullName="known", shortName = "known", doc="Input VCF file(s) with known indels", required=false)
public List<RodBinding<VariantContext>> known = Collections.emptyList();
/**
* The interval list output from the RealignerTargetCreator tool using the same bam(s), reference, and known indel file(s).
*/
@Input(fullName="targetIntervals", shortName="targetIntervals", doc="intervals file output from RealignerTargetCreator", required=true)
protected String intervalsFile = null;
/**
* This term is equivalent to "significance" - i.e. is the improvement significant enough to merit realignment? Note that this number
* should be adjusted based on your particular data set. For low coverage and/or when looking for indels with low allele frequency,
* this number should be smaller.
*/
@Argument(fullName="LODThresholdForCleaning", shortName="LOD", doc="LOD threshold above which the cleaner will clean", required=false)
protected double LOD_THRESHOLD = 5.0;
@Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
protected double MISMATCH_THRESHOLD = 0.15;
/**
* The realigned bam file.
*/
@Output(required=false, doc="Output bam")
protected StingSAMFileWriter writer = null;
protected ConstrainedMateFixingManager manager = null;
protected SAMFileWriter writerToUse = null;
@Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "How should we determine the possible alternate consenses? -- in the order of least permissive to most permissive there is KNOWNS_ONLY (use only indels from known indels provided in RODs), USE_READS (additionally use indels already present in the original alignments of the reads), and USE_SW (additionally use 'Smith-Waterman' to generate alternate consenses). The default is USE_READS", required = false)
/**
* We recommend that users run with USE_READS when trying to realign high quality longer read data mapped with a gapped aligner;
* Smith-Waterman is really only necessary when using an ungapped aligner (e.g. MAQ in the case of single-end read data).
*/
@Argument(fullName = "consensusDeterminationModel", shortName = "model", doc = "Determines how to compute the possible alternate consenses", required = false)
public ConsensusDeterminationModel consensusModel = ConsensusDeterminationModel.USE_READS;
// ADVANCED OPTIONS FOLLOW
@Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+
"Keep it low to minimize memory consumption (but the tool may skip realignment on regions with too much coverage. If it is too low, it may generate errors during realignment); keep it high to maximize realignment (but make sure to give Java enough memory).", required=false)
/**
* For expert users only! This is similar to the argument in the RealignerTargetCreator walker. The point here is that the realigner
* will only proceed with the realignment (even above the given threshold) if it minimizes entropy among the reads (and doesn't simply
* push the mismatch column to another position). This parameter is just a heuristic and should be adjusted based on your particular data set.
*/
@Advanced
@Argument(fullName="entropyThreshold", shortName="entropy", doc="percentage of mismatches at a locus to be considered having high entropy", required=false)
protected double MISMATCH_THRESHOLD = 0.15;
/**
* For expert users only! To minimize memory consumption you can lower this number (but then the tool may skip realignment on regions with too much coverage;
* and if the number is too low, it may generate errors during realignment). Just make sure to give Java enough memory! 4Gb should be enough with the default value.
*/
@Advanced
@Argument(fullName="maxReadsInMemory", shortName="maxInMemory", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter", required=false)
protected int MAX_RECORDS_IN_MEMORY = 150000;
/**
* For expert users only!
*/
@Advanced
@Argument(fullName="maxIsizeForMovement", shortName="maxIsize", doc="maximum insert size of read pairs that we attempt to realign", required=false)
protected int MAX_ISIZE_FOR_MOVEMENT = 3000;
/**
* For expert users only!
*/
@Advanced
@Argument(fullName="maxPositionalMoveAllowed", shortName="maxPosMove", doc="maximum positional move in basepairs that a read can be adjusted during realignment", required=false)
protected int MAX_POS_MOVE_ALLOWED = 200;
/**
* For expert users only! If you need to find the optimal solution regardless of running time, use a higher number.
*/
@Advanced
@Argument(fullName="maxConsensuses", shortName="maxConsensuses", doc="max alternate consensuses to try (necessary to improve performance in deep coverage)", required=false)
protected int MAX_CONSENSUSES = 30;
/**
* For expert users only! If you need to find the optimal solution regardless of running time, use a higher number.
*/
@Advanced
@Argument(fullName="maxReadsForConsensuses", shortName="greedy", doc="max reads used for finding the alternate consensuses (necessary to improve performance in deep coverage)", required=false)
protected int MAX_READS_FOR_CONSENSUSES = 120;
@Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment; "+
"if this value is exceeded, realignment is not attempted and the reads are passed to the output file(s) as-is", required=false)
/**
* For expert users only! If this value is exceeded at a given interval, realignment is not attempted and the reads are passed to the output file(s) as-is.
* If you need to allow more reads (e.g. with very deep coverage) regardless of memory, use a higher number.
*/
@Advanced
@Argument(fullName="maxReadsForRealignment", shortName="maxReads", doc="max reads allowed at an interval for realignment", required=false)
protected int MAX_READS = 20000;
@Argument(fullName="noPGTag", shortName="noPG", required=false,
doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. "+
"This option is required in order to pass integration tests.")
protected boolean NO_PG_TAG = false;
@Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false,
doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam.")
@Advanced
@Argument(fullName="noOriginalAlignmentTags", shortName="noTags", required=false, doc="Don't output the original cigar or alignment start tags for each realigned read in the output bam")
protected boolean NO_ORIGINAL_ALIGNMENT_TAGS = false;
@Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false,
doc="This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, "+
"it will throw an exception. Use this argument when your interval list is not sorted to instruct "+"" +
"the Realigner to first sort it in memory.")
/**
* For expert users only! This tool assumes that the target interval list is sorted; if the list turns out to be unsorted, it will throw an exception.
* Use this argument when your interval list is not sorted to instruct the Realigner to first sort it in memory.
*/
@Advanced
@Argument(fullName="targetIntervalsAreNotSorted", shortName="targetNotSorted", required=false, doc="The target intervals are not sorted")
protected boolean TARGET_NOT_SORTED = false;
//NWay output: testing, not ready for the prime time, hence hidden:
@Hidden
@Argument(fullName="nWayOut", shortName="nWayOut", required=false,
doc="Generate one output file for each input (-I) bam file. Reads from all input files "+
"will be realigned together, but then each read will be saved in the output file corresponding to "+
"the input file the read came from. There are two ways to generate output bam file names: 1) if the "+
"value of this argument is a general string (e.g. '.cleaned.bam'), then "+
"extensions (\".bam\" or \".sam\") will be stripped from the input file names and the provided string value "+
"will be pasted on instead; 2) if the value ends with a '.map' (e.g. input_output.map), then " +
"the two-column tab-separated file with the specified name must exist and list unique output file name (2nd column)" +
"for each input file name (1st column).")
/**
* Reads from all input files will be realigned together, but then each read will be saved in the output file corresponding to the input file that
* the read came from. There are two ways to generate output bam file names: 1) if the value of this argument is a general string (e.g. '.cleaned.bam'),
* then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the
* value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output
* file name (2nd column) for each input file name (1st column).
*/
@Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file")
protected String N_WAY_OUT = null;
// DEBUGGING OPTIONS FOLLOW
@Hidden
@Argument(fullName="check_early",shortName="check_early",required=false,doc="Do early check of reads against existing consensuses")
protected boolean CHECKEARLY = false;
// DEBUGGING OPTIONS FOLLOW
@Hidden
@Argument(fullName="noPGTag", shortName="noPG", required=false,
doc="Don't output the usual PG tag in the realigned bam file header. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.")
protected boolean NO_PG_TAG = false;
@Hidden
@Output(fullName="indelsFileForDebugging", shortName="indels", required=false, doc="Output file (text) for the indels found; FOR DEBUGGING PURPOSES ONLY")
@ -786,7 +886,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
for ( VariantContext knownIndel : knownIndelsToTry ) {
if ( knownIndel == null || !knownIndel.isIndel() || knownIndel.isComplexIndel() )
continue;
byte[] indelStr = knownIndel.isInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length());
byte[] indelStr = knownIndel.isSimpleInsertion() ? knownIndel.getAlternateAllele(0).getBases() : Utils.dupBytes((byte)'-', knownIndel.getReference().length());
int start = knownIndel.getStart() - leftmostIndex + 1;
Consensus c = createAlternateConsensus(start, reference, indelStr, knownIndel);
if ( c != null )
@ -988,11 +1088,11 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
if ( indexOnRef > 0 )
cigar.add(new CigarElement(indexOnRef, CigarOperator.M));
if ( indel.isDeletion() ) {
if ( indel.isSimpleDeletion() ) {
refIdx += indelStr.length;
cigar.add(new CigarElement(indelStr.length, CigarOperator.D));
}
else if ( indel.isInsertion() ) {
else if ( indel.isSimpleInsertion() ) {
for ( byte b : indelStr )
sb.append((char)b);
cigar.add(new CigarElement(indelStr.length, CigarOperator.I));

View File

@ -35,16 +35,46 @@ import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
/**
* Left aligns indels in reads.
* Left-aligns indels from reads in a bam file.
*
* <p>
* LeftAlignIndels is a tool that takes a bam file and left-aligns any indels inside it. The same indel can often be
* placed at multiple positions and still represent the same haplotype. While a standard convention is to place an
* indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
*
* <h2>Input</h2>
* <p>
* A bam file to left-align.
* </p>
*
* <h2>Output</h2>
* <p>
* A left-aligned bam.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx3g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T LeftAlignIndels \
* -I input.bam \
* -o output.vcf
* </pre>
*
*/
public class LeftAlignIndels extends ReadWalker<Integer, Integer> {
@Output(required=false, doc="Output bam")
protected StingSAMFileWriter writer = null;
@Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the SAMFileWriter. "+
"If too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool may run out of memory.", required=false)
/**
* If set too low, the tool may run out of system file descriptors needed to perform sorting; if too high, the tool
* may run out of memory. We recommend that you additionally tell Java to use a temp directory with plenty of available
* space (by setting java.io.tempdir on the command-line).
*/
@Argument(fullName="maxReadsInRam", shortName="maxInRam", doc="max reads allowed to be kept in memory at a time by the output writer", required=false)
protected int MAX_RECORDS_IN_RAM = 500000;
public void initialize() {

View File

@ -33,7 +33,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.BadCigarFilter;
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
@ -52,38 +52,94 @@ import java.util.Collections;
import java.util.List;
/**
* Emits intervals for the Local Indel Realigner to target for cleaning. Ignores 454 reads, MQ0 reads, and reads with consecutive indel operators in the CIGAR string.
* Emits intervals for the Local Indel Realigner to target for realignment.
*
* <p>
* The local realignment tool is designed to consume one or more BAM files and to locally realign reads such that the number of mismatching bases
* is minimized across all the reads. In general, a large percent of regions requiring local realignment are due to the presence of an insertion
* or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching
* the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently,
* it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are
* correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel,
* also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus
* indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an
* appropriate alternate reference (i.e. indel) exists. Following local realignment, the GATK tool Unified Genotyper can be used to sensitively and
* specifically identify indels.
* <p>
* <ol>There are 2 steps to the realignment process:
* <li>Determining (small) suspicious intervals which are likely in need of realignment (RealignerTargetCreator)</li>
* <li>Running the realigner over those intervals (see the IndelRealigner tool)</li>
* </ol>
* <p>
* An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step.
* <p>
* Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them
* (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string.
*
* <h2>Input</h2>
* <p>
* One or more aligned BAM files and optionally one or more lists of known indels.
* </p>
*
* <h2>Output</h2>
* <p>
* A list of target intervals to pass to the Indel Realigner.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -I input.bam \
* -R ref.fasta \
* -T RealignerTargetCreator \
* -o forIndelRealigner.intervals \
* [--known /path/to/indels.vcf]
* </pre>
*
* @author ebanks
*/
@ReadFilters({Platform454Filter.class, MappingQualityZeroReadFilter.class, BadCigarFilter.class})
@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, BadCigarFilter.class})
@Reference(window=@Window(start=-1,stop=50))
@Allows(value={DataSource.READS, DataSource.REFERENCE})
@By(DataSource.REFERENCE)
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
public class RealignerTargetCreator extends RodWalker<RealignerTargetCreator.Event, RealignerTargetCreator.Event> {
/**
* The target intervals for realignment.
*/
@Output
protected PrintStream out;
/**
* Any number of VCF files representing known SNPs and/or indels. Could be e.g. dbSNP and/or official 1000 Genomes indel calls.
* SNPs in these files will be ignored unless the --mismatchFraction argument is used.
*/
@Input(fullName="known", shortName = "known", doc="Input VCF file with known indels", required=false)
public List<RodBinding<VariantContext>> known = Collections.emptyList();
// mismatch/entropy/SNP arguments
/**
* Any two SNP calls and/or high entropy positions are considered clustered when they occur no more than this many basepairs apart.
*/
@Argument(fullName="windowSize", shortName="window", doc="window size for calculating entropy or SNP clusters", required=false)
protected int windowSize = 10;
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy; to disable set to <= 0 or > 1", required=false)
/**
* To disable this behavior, set this value to <= 0 or > 1. This feature is really only necessary when using an ungapped aligner
* (e.g. MAQ in the case of single-end read data) and should be used in conjunction with '--model USE_SW' in the IndelRealigner.
*/
@Argument(fullName="mismatchFraction", shortName="mismatch", doc="fraction of base qualities needing to mismatch for a position to have high entropy", required=false)
protected double mismatchThreshold = 0.0;
@Argument(fullName="minReadsAtLocus", shortName="minReads", doc="minimum reads at a locus to enable using the entropy calculation", required=false)
protected int minReadsAtLocus = 4;
// interval merging arguments
/**
* Because the realignment algorithm is N^2, allowing too large an interval might take too long to completely realign.
*/
@Argument(fullName="maxIntervalSize", shortName="maxInterval", doc="maximum interval size", required=false)
protected int maxIntervalSize = 500;
@Deprecated
@Argument(fullName="realignReadsWithBadMates", doc="This argument is no longer used.", required=false)
protected boolean DEPRECATED_REALIGN_MATES = false;
@Override
public boolean generateExtendedEvents() { return true; }
@ -122,7 +178,7 @@ public class RealignerTargetCreator extends RodWalker<RealignerTargetCreator.Eve
switch ( vc.getType() ) {
case INDEL:
hasIndel = true;
if ( vc.isInsertion() )
if ( vc.isSimpleInsertion() )
hasInsertion = true;
break;
case SNP:
@ -131,7 +187,7 @@ public class RealignerTargetCreator extends RodWalker<RealignerTargetCreator.Eve
case MIXED:
hasPointEvent = true;
hasIndel = true;
if ( vc.isInsertion() )
if ( vc.isSimpleInsertion() )
hasInsertion = true;
break;
default:

View File

@ -33,15 +33,15 @@ import org.broadinstitute.sting.commandline.Tags;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.filters.Platform454Filter;
import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter;
import org.broadinstitute.sting.gatk.filters.PlatformUnitFilterHelper;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.features.refseq.Transcript;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqCodec;
import org.broadinstitute.sting.gatk.refdata.features.refseq.RefSeqFeature;
import org.broadinstitute.sting.utils.codecs.refseq.Transcript;
import org.broadinstitute.sting.utils.codecs.refseq.RefSeqCodec;
import org.broadinstitute.sting.utils.codecs.refseq.RefSeqFeature;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
@ -78,7 +78,7 @@ import java.util.*;
* if first bam has coverage at the site but no indication for an indel. In the --somatic mode, BED output contains
* only somatic calls, while --verbose output contains all calls annotated with GERMLINE/SOMATIC keywords.
*/
@ReadFilters({Platform454Filter.class, MappingQualityZeroReadFilter.class, PlatformUnitFilter.class})
@ReadFilters({Platform454Filter.class, MappingQualityZeroFilter.class, PlatformUnitFilter.class})
public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
// @Output
// PrintStream out;
@ -469,10 +469,20 @@ public class SomaticIndelDetectorWalker extends ReadWalker<Integer,Integer> {
// let's double check now that the read fits after the shift
if ( read.getAlignmentEnd() > normal_context.getStop()) {
// ooops, looks like the read does not fit into the window even after the latter was shifted!!
throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+
"Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+
// we used to die over such reads and require user to run with larger window size. Now we
// just print a warning and discard the read (this means that our counts can be slightly off in
 // the presence of such reads)
//throw new UserException.BadArgumentValue("window_size", "Read "+read.getReadName()+": out of coverage window bounds. Probably window is too small, so increase the value of the window_size argument.\n"+
// "Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+
// read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+
// "; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop());
System.out.println("WARNING: Read "+read.getReadName()+
" is out of coverage window bounds. Probably window is too small and the window_size value must be increased.\n"+
" The read is ignored in this run (so all the counts/statistics reported will not include it).\n"+
" Read length="+read.getReadLength()+"; cigar="+read.getCigarString()+"; start="+
read.getAlignmentStart()+"; end="+read.getAlignmentEnd()+
"; window start (after trying to accomodate the read)="+normal_context.getStart()+"; window end="+normal_context.getStop());
return 1;
}
}

View File

@ -23,12 +23,15 @@
*/
package org.broadinstitute.sting.gatk.walkers.phasing;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.ArgumentCollection;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.BaseUtils;
@ -49,16 +52,46 @@ import java.util.*;
import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFromRods;
/**
* Walks along all variant ROD loci, caching a user-defined window of VariantContext sites, and then finishes phasing them when they go out of range (using upstream and downstream reads).
*
* <p>
* Performs physical phasing of SNP calls, based on sequencing reads.
* </p>
*
* <h2>Input</h2>
* <p>
* VCF file of SNP calls, BAM file of sequence reads.
* </p>
*
* <h2>Output</h2>
* <p>
* Phased VCF file.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java
* -jar GenomeAnalysisTK.jar
* -T ReadBackedPhasing
* -R reference.fasta
* -I reads.bam
* --variant:vcf SNPs.vcf
* -BTI variant
* -BTIMR INTERSECTION
* -o phased_SNPs.vcf
* --phaseQualityThresh 20.0
* </pre>
*
* @author Menachem Fromer
* @since July 2010
*/
@Allows(value = {DataSource.READS, DataSource.REFERENCE})
@Requires(value = {DataSource.READS, DataSource.REFERENCE})
@By(DataSource.READS)
@ReadFilters({MappingQualityZeroReadFilter.class})
// Filter out all reads with zero mapping quality
@ReadFilters({MappingQualityZeroFilter.class})
public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, PhasingStats> {
private static final boolean DEBUG = false;
@ -73,13 +106,13 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
@Output(doc = "File to which variants should be written", required = true)
protected VCFWriter writer = null;
@Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads; [default:20000]", required = false)
@Argument(fullName = "cacheWindowSize", shortName = "cacheWindow", doc = "The window size (in bases) to cache variant sites and their reads for the phasing procedure", required = false)
protected Integer cacheWindow = 20000;
@Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm; [default:10]", required = false)
@Argument(fullName = "maxPhaseSites", shortName = "maxSites", doc = "The maximum number of successive heterozygous sites permitted to be used by the phasing algorithm", required = false)
protected Integer maxPhaseSites = 10; // 2^10 == 10^3 diploid haplotypes
@Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing; [default:10.0]", required = false)
@Argument(fullName = "phaseQualityThresh", shortName = "phaseThresh", doc = "The minimum phasing quality score required to output phasing", required = false)
protected Double phaseQualityThresh = 10.0; // PQ = 10.0 <=> P(error) = 10^(-10/10) = 0.1, P(correct) = 0.9
@Hidden
@ -87,10 +120,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
protected String variantStatsFilePrefix = null;
private PhasingQualityStatsWriter statsWriter = null;
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing [default: 17]", required = false)
@Argument(fullName = "min_base_quality_score", shortName = "mbq", doc = "Minimum base quality required to consider a base for phasing", required = false)
public int MIN_BASE_QUALITY_SCORE = 17;
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing [default: 20]", required = false)
@Argument(fullName = "min_mapping_quality_score", shortName = "mmq", doc = "Minimum read mapping quality required to consider a read for phasing", required = false)
public int MIN_MAPPING_QUALITY_SCORE = 20;
@Argument(fullName = "sampleToPhase", shortName = "sampleToPhase", doc = "Only include these samples when phasing", required = false)
@ -111,10 +144,10 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
public static final String PHASING_INCONSISTENT_KEY = "PhasingInconsistent";
@Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", doc = "Merge consecutive phased sites into MNP records [default:false]", required = false)
@Argument(fullName = "enableMergePhasedSegregatingPolymorphismsToMNP", shortName = "enableMergeToMNP", doc = "Merge consecutive phased sites into MNP records", required = false)
protected boolean enableMergePhasedSegregatingPolymorphismsToMNP = false;
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record; [default:1]", required = false)
@Argument(fullName = "maxGenomicDistanceForMNP", shortName = "maxDistMNP", doc = "The maximum reference-genome distance between consecutive heterozygous sites to permit merging phased VCF records into a MNP record", required = false)
protected int maxGenomicDistanceForMNP = 1;
@Hidden

View File

@ -2,16 +2,18 @@ package org.broadinstitute.sting.gatk.walkers.qc;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.collections.Pair;
import java.io.PrintStream;
import java.util.Collections;
import java.util.List;
/**
@ -23,6 +25,9 @@ public class CountIntervals extends RefWalker<Long, Long> {
@Output
PrintStream out;
@Input(fullName="check", shortName = "check", doc="Any number of RODs", required=false)
public List<RodBinding<Feature>> features = Collections.emptyList();
@Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false)
int numOverlaps = 2;
@ -37,7 +42,7 @@ public class CountIntervals extends RefWalker<Long, Long> {
return null;
}
List<Feature> checkIntervals = tracker.getValues(Feature.class, "check");
List<Feature> checkIntervals = tracker.getValues(features);
return (long) checkIntervals.size();
}

View File

@ -11,7 +11,31 @@ import java.io.PrintStream;
/**
* Walks over the input data set, calculating the total number of covered loci for diagnostic purposes.
*
* <p>
* Simplest example of a locus walker.
*
*
* <h2>Input</h2>
* <p>
* One or more BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of loci traversed.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CountLoci \
* -o output.txt \
* -I input.bam \
* [-L input.intervals]
* </pre>
*
*/
public class CountLociWalker extends LocusWalker<Integer, Long> implements TreeReducible<Long> {
@Output(doc="Write count to this file instead of STDOUT")

View File

@ -39,6 +39,26 @@ import java.util.List;
* query name order. Breaks counts down by total pairs and number
* of paired reads.
*
*
* <h2>Input</h2>
* <p>
* One or more bam files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of pairs seen.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CountPairs \
* -o output.txt \
* -I input.bam
* </pre>
*
* @author mhanna
*/
public class CountPairsWalker extends ReadPairWalker<Integer,Long> {

View File

@ -25,7 +25,10 @@
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -33,25 +36,55 @@ import org.broadinstitute.sting.gatk.walkers.RefWalker;
import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.collections.Pair;
import java.util.Collections;
import java.util.List;
/**
* Prints out counts of the number of reference ordered data objects are
* each locus for debugging RefWalkers.
* Prints out counts of the number of reference ordered data objects encountered.
*
*
* <h2>Input</h2>
* <p>
* One or more rod files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of rods seen.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CountRODsByRef \
* -o output.txt \
* --rod input.vcf
* </pre>
*
*/
public class CountRodByRefWalker extends RefWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
public class CountRODsByRefWalker extends RefWalker<CountRODsWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> {
/**
* One or more input rod files
*/
@Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false)
public List<RodBinding<Feature>> rods = Collections.emptyList();
@Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false)
public boolean verbose = false;
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false)
public boolean showSkipped = false;
CountRodWalker crw = new CountRodWalker();
CountRODsWalker crw = new CountRODsWalker();
public void initialize() {
crw.verbose = verbose;
crw.showSkipped = showSkipped;
}
public CountRodWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
public CountRODsWalker.Datum map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
return crw.map(tracker, ref, context);
}
@ -59,7 +92,7 @@ public class CountRodByRefWalker extends RefWalker<CountRodWalker.Datum, Pair<Ex
return crw.reduceInit();
}
public Pair<ExpandingArrayList<Long>, Long> reduce(CountRodWalker.Datum point, Pair<ExpandingArrayList<Long>, Long> sum) {
public Pair<ExpandingArrayList<Long>, Long> reduce(CountRODsWalker.Datum point, Pair<ExpandingArrayList<Long>, Long> sum) {
return crw.reduce(point, sum);
}
}

View File

@ -27,8 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.qc;
import net.sf.samtools.SAMSequenceDictionary;
import net.sf.samtools.SAMSequenceRecord;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -41,23 +44,46 @@ import org.broadinstitute.sting.utils.collections.ExpandingArrayList;
import org.broadinstitute.sting.utils.collections.Pair;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.*;
/**
* Prints out counts of the number of reference ordered data objects are
* each locus for debugging RodWalkers.
* Prints out counts of the number of reference ordered data objects encountered.
*
*
* <h2>Input</h2>
* <p>
* One or more rod files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of rods seen.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CountRODs \
* -o output.txt \
* --rod input.vcf
* </pre>
*
*/
public class CountRodWalker extends RodWalker<CountRodWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> implements TreeReducible<Pair<ExpandingArrayList<Long>, Long>> {
public class CountRODsWalker extends RodWalker<CountRODsWalker.Datum, Pair<ExpandingArrayList<Long>, Long>> implements TreeReducible<Pair<ExpandingArrayList<Long>, Long>> {
@Output
public PrintStream out;
@Argument(fullName = "verbose", shortName = "v", doc="If true, Countrod will print out detailed information about the rods it finds and locations", required = false)
/**
* One or more input rod files
*/
@Input(fullName="rod", shortName = "rod", doc="Input VCF file(s)", required=false)
public List<RodBinding<Feature>> rods = Collections.emptyList();
@Argument(fullName = "verbose", shortName = "v", doc="If true, this tool will print out detailed information about the rods it finds and locations", required = false)
public boolean verbose = false;
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, CountRod will print out the skippped locations", required = false)
@Argument(fullName = "showSkipped", shortName = "s", doc="If true, this tool will print out the skipped locations", required = false)
public boolean showSkipped = false;
@Override

View File

@ -9,8 +9,32 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
/**
* Walks over the input data set, calculating the number of reads seen for diagnostic purposes.
*
* <p>
* Can also count the number of reads matching a given criterion using read filters (see the
 * --read_filter command line argument). Simplest example of a read-backed analysis.
*
*
* <h2>Input</h2>
* <p>
* One or more BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* Number of reads seen.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CountReads \
* -o output.txt \
* -I input.bam \
* [-L input.intervals]
* </pre>
*
*/
@Requires({DataSource.READS, DataSource.REFERENCE})
public class CountReadsWalker extends ReadWalker<Integer, Integer> {

View File

@ -0,0 +1,115 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.qc;
import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
/**
 * Summary test
 *
 * <p>Body test</p>
 */
// NOTE(review): this walker is a fixture for the GATKDocs javadoc-doclet generation
// system (the build file runs the doclet with a "-test" flag restricted to this walker).
// The terse javadoc text above ("Summary test" / "Body test") and the deliberately varied
// argument declarations below exist to exercise the documentation generator, so the
// doc strings themselves are intentionally left byte-for-byte unchanged.
public class DocumentationTest extends RodWalker<Integer, Integer> {
    // the docs for the arguments are in the collection
    @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();

    /**
     * dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants.
     * Other sets can be specified with the -knownName (--known_names) argument.
     */
    @ArgumentCollection
    protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();

    /**
     * detailed documentation about the argument goes here.
     */
    @Input(fullName="listofRodBinding", shortName = "disc", doc="Output variants that were not called in this Feature comparison track", required=false)
    private List<RodBinding<VariantContext>> listOfRodBinding = Collections.emptyList();

    // optional RodBinding fields in three flavors -- no initializer, explicit null,
    // and null with a distinct short name -- to test how the doc generator renders defaults
    @Input(fullName="optionalRodBinding", shortName = "conc", doc="Output variants that were also called in this Feature comparison track", required=false)
    private RodBinding<VariantContext> concordanceTrack;

    @Input(fullName="optionalRodBindingWithoutDefault", shortName = "optionalRodBindingWithoutDefault", doc="Output variants that were also called in this Feature comparison track", required=false)
    private RodBinding<VariantContext> noDefaultOptionalRodBinding;

    @Input(fullName="optionalRodBindingWithoutDefaultNull", shortName = "shortTest", doc="Output variants that were also called in this Feature comparison track", required=false)
    private RodBinding<VariantContext> noDefaultOptionalRodBindingNull = null;

    // a RodBinding over the generic tribble Feature type rather than VariantContext
    @Input(fullName="featureArg", shortName = "featureArg", doc="A RodBinding of feature", required=false)
    private RodBinding<Feature> featureArg = null;

    @Output(doc="VCFWriter",required=true)
    protected VCFWriter vcfWriter = null;

    // @Advanced arguments should be rendered in the generated docs' advanced section
    @Advanced
    @Argument(fullName="setString", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
    public Set<String> sampleNames;

    @Argument(fullName="setStringInitialized", shortName="setStringInitialized", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
    public Set<String> setStringInitialized = new HashSet<String>();

    // argument declared with a shortName only (no fullName) and a non-null empty default
    @Argument(shortName="optionalArgWithMissinglessDefault", doc="One or more criteria to use when selecting the data. Evaluated *after* the specified samples are extracted and the INFO-field annotations are updated.", required=false)
    public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<String>();

    // name chosen so this sorts first, testing argument ordering in the generated docs
    @Argument(shortName="AAAAA", fullName = "AAAAA", doc="Should be the first argument", required=false)
    public boolean FIRST_ARG = false;

    @Advanced
    @Argument(fullName="booleanArg", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
    private boolean EXCLUDE_NON_VARIANTS = false;

    @Advanced
    @Argument(fullName="booleanArray", shortName="booleanArray", doc="x", required=false)
    private boolean[] boolArray = null;

    // enum-typed argument; the per-constant javadoc below should appear in the docs
    @Argument(fullName="enumTest", shortName="enumTest", doc="Test enum", required=false)
    private TestEnum TestEnumArg = TestEnum.ENUM2;
    public enum TestEnum {
        /** Docs for enum1 */
        ENUM1,
        /** Docs for enum2 */
        ENUM2
    }

    // @Hidden arguments should be omitted from the generated documentation entirely
    @Hidden
    @Argument(fullName="hiddenArg", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
    private boolean KEEP_AF_SPECTRUM = false;

    // no-op traversal: this walker exists only so its declarations can be documented
    public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { return 0; }

    public Integer reduceInit() { return 0; }

    public Integer reduce(Integer value, Integer sum) { return value + sum; }

    public void onTraversalDone(Integer result) { }
}

View File

@ -32,7 +32,7 @@ import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.sampileup.SAMPileupFeature;
import org.broadinstitute.sting.utils.codecs.sampileup.SAMPileupFeature;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;

View File

@ -1,3 +1,28 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.recalibration;
import org.broadinstitute.sting.commandline.Gatherer;
@ -12,11 +37,8 @@ import java.util.List;
import java.util.regex.Pattern;
/**
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 3/29/11
* Time: 3:54 PM
* To change this template use File | Settings | File Templates.
*/

View File

@ -29,8 +29,8 @@ import org.broad.tribble.Feature;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableReadFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroReadFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter;
import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.BaseUtils;
@ -50,27 +50,54 @@ import java.util.List;
import java.util.Map;
/**
* First pass of the recalibration. Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
* First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide).
*
* This walker is designed to work as the first pass in a two-pass processing step.
* It does a by-locus traversal operating only at sites that are not in dbSNP.
* We assume that all reference mismatches we see are therefore errors and indicative of poor base quality.
* This walker generates tables based on various user-specified covariates (such as read group, reported quality score, cycle, and dinucleotide)
* Since there is a large amount of data one can then calculate an empirical probability of error
* given the particular covariates seen at this site, where p(error) = num mismatches / num observations
* The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score)
* The first non-comment line of the output file gives the name of the covariates that were used for this calculation.
* <p>
* This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating
* only at sites that are not in dbSNP. We assume that all reference mismatches we see are therefore errors and indicative
* of poor base quality. This walker generates tables based on various user-specified covariates (such as read group,
* reported quality score, cycle, and dinucleotide). Since there is a large amount of data one can then calculate an empirical
* probability of error given the particular covariates seen at this site, where p(error) = num mismatches / num observations.
* The output file is a CSV list of (the several covariate values, num observations, num mismatches, empirical quality score).
* <p>
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified.
*
* Note: ReadGroupCovariate and QualityScoreCovariate are required covariates and will be added for the user regardless of whether or not they were specified
* Note: This walker is designed to be used in conjunction with TableRecalibrationWalker.
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
*
* <h2>Input</h2>
* <p>
* The input read data whose base quality scores need to be assessed.
* <p>
* A database of known polymorphic sites to skip over.
* </p>
*
* <h2>Output</h2>
* <p>
* A recalibration table file in CSV format that is used by the TableRecalibration walker.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -R resources/Homo_sapiens_assembly18.fasta \
* -knownSites bundle/hg18/dbsnp_132.hg18.vcf \
* -knownSites another/optional/setOfSitesToMask.vcf \
* -I my_reads.bam \
* -T CountCovariates \
* -cov ReadGroupCovariate \
* -cov QualityScoreCovariate \
* -cov CycleCovariate \
* -cov DinucCovariate \
* -recalFile my_reads.recal_data.csv
* </pre>
*
* @author rpoplin
* @since Nov 3, 2009
*/
@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN)
@By( DataSource.READS ) // Only look at covered loci, not every loci of the reference file
@ReadFilters( {MappingQualityZeroReadFilter.class, MappingQualityUnavailableReadFilter.class} ) // Filter out all reads with zero or unavailable mapping quality
@ReadFilters( {MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class} ) // Filter out all reads with zero or unavailable mapping quality
@Requires( {DataSource.READS, DataSource.REFERENCE, DataSource.REFERENCE_BASES} ) // This walker requires both -I input.bam and -R reference.fasta
@PartitionBy(PartitionType.LOCUS)
public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.CountedData, CountCovariatesWalker.CountedData> implements TreeReducible<CountCovariatesWalker.CountedData> {
@ -96,14 +123,23 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
*/
@Input(fullName="knownSites", shortName = "knownSites", doc="A database of known polymorphic sites to skip over in the recalibration algorithm", required=false)
public List<RodBinding<Feature>> knownSites = Collections.emptyList();
@Output
PrintStream out;
/**
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/
@Output(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the output covariates table recalibration file")
@Gather(CountCovariatesGatherer.class)
public PrintStream RECAL_FILE;
@Argument(fullName="list", shortName="ls", doc="List the available covariates and exit", required=false)
private boolean LIST_ONLY = false;
/**
* See the -list argument to view available covariates.
*/
@Argument(fullName="covariate", shortName="cov", doc="Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required=false)
private String[] COVARIATES = null;
@Argument(fullName="standard_covs", shortName="standard", doc="Use the standard set of covariates in addition to the ones listed using the -cov argument", required=false)
@ -114,6 +150,10 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
/////////////////////////////
@Argument(fullName="dont_sort_output", shortName="unsorted", required=false, doc="If specified, the output table recalibration csv file will be in an unsorted, arbitrary order to save some run time.")
private boolean DONT_SORT_OUTPUT = false;
/**
* This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option.
*/
@Argument(fullName="run_without_dbsnp_potentially_ruining_quality", shortName="run_without_dbsnp_potentially_ruining_quality", required=false, doc="If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.")
private boolean RUN_WITHOUT_DBSNP = false;
@ -178,11 +218,11 @@ public class CountCovariatesWalker extends LocusWalker<CountCovariatesWalker.Cou
// Print and exit if that's what was requested
if ( LIST_ONLY ) {
out.println( "Available covariates:" );
logger.info( "Available covariates:" );
for( Class<?> covClass : covariateClasses ) {
out.println( covClass.getSimpleName() );
logger.info( covClass.getSimpleName() );
}
out.println();
logger.info("");
System.exit( 0 ); // Early exit here because user requested it
}

View File

@ -66,15 +66,22 @@ public class RecalDataManager {
private static boolean warnUserNullPlatform = false;
/**
 * Strategies for handling SOLiD bases at positions where the aligner inserted the
 * reference base because of color-space inconsistencies. Selected via the
 * --solid_recal_mode argument (see RecalibrationArgumentCollection).
 */
public enum SOLID_RECAL_MODE {
/** Treat reference inserted bases as reference matching bases. Very unsafe! */
DO_NOTHING,
/** Set reference inserted bases and the previous base (because of color space alignment details) to Q0. This is the default option. */
SET_Q_ZERO,
/** In addition to setting the quality scores to zero, also set the base itself to 'N'. This is useful to visualize in IGV. */
SET_Q_ZERO_BASE_N,
/** Look at the color quality scores and probabilistically decide to change the reference inserted base to be the base which is implied by the original color space instead of the reference. */
REMOVE_REF_BIAS
}
/**
 * Strategies for handling reads whose SOLiD color-space tag contains no-calls.
 * Such reads cannot be recalibrated; this enum controls what the recalibrator
 * does with them (selected via the --solid_nocall_strategy argument).
 */
public enum SOLID_NOCALL_STRATEGY {
/** When a no call is detected throw an exception to alert the user that recalibrating this SOLiD data is unsafe. This is the default option. */
THROW_EXCEPTION,
/** Leave the read in the output bam completely untouched. This mode is only okay if the no calls are very rare. */
LEAVE_READ_UNRECALIBRATED,
/** Mark these reads as failing vendor quality checks so they can be filtered out by downstream analyses. */
PURGE_READ
}

View File

@ -51,12 +51,27 @@ public class RecalibrationArgumentCollection {
public String FORCE_PLATFORM = null;
@Argument(fullName = "window_size_nqs", shortName="nqs", doc="The window size used by MinimumNQSCovariate for its calculation", required=false)
public int WINDOW_SIZE = 5;
/**
* This window size tells the module in how big of a neighborhood around the current base it should look for the minimum base quality score.
*/
@Argument(fullName = "homopolymer_nback", shortName="nback", doc="The number of previous bases to look at in HomopolymerCovariate", required=false)
public int HOMOPOLYMER_NBACK = 7;
@Argument(fullName = "exception_if_no_tile", shortName="throwTileException", doc="If provided, TileCovariate will throw an exception when no tile can be found. The default behavior is to use tile = -1", required=false)
public boolean EXCEPTION_IF_NO_TILE = false;
/**
* CountCovariates and TableRecalibration accept a --solid_recal_mode <MODE> flag which governs how the recalibrator handles the
* reads which have had the reference inserted because of color space inconsistencies.
*/
@Argument(fullName="solid_recal_mode", shortName="sMode", required = false, doc="How should we recalibrate solid bases in which the reference was inserted? Options = DO_NOTHING, SET_Q_ZERO, SET_Q_ZERO_BASE_N, or REMOVE_REF_BIAS")
public RecalDataManager.SOLID_RECAL_MODE SOLID_RECAL_MODE = RecalDataManager.SOLID_RECAL_MODE.SET_Q_ZERO;
/**
* CountCovariates and TableRecalibration accept a --solid_nocall_strategy <MODE> flag which governs how the recalibrator handles
* no calls in the color space tag. Unfortunately because of the reference inserted bases mentioned above, reads with no calls in
* their color space tag can not be recalibrated.
*/
@Argument(fullName = "solid_nocall_strategy", shortName="solid_nocall_strategy", doc="Defines the behavior of the recalibrator when it encounters no calls in the color space. Options = THROW_EXCEPTION, LEAVE_READ_UNRECALIBRATED, or PURGE_READ", required=false)
public RecalDataManager.SOLID_NOCALL_STRATEGY SOLID_NOCALL_STRATEGY = RecalDataManager.SOLID_NOCALL_STRATEGY.THROW_EXCEPTION;
}

View File

@ -52,19 +52,40 @@ import java.util.ResourceBundle;
import java.util.regex.Pattern;
/**
* Second pass of the recalibration. Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
* Second pass of the base quality score recalibration -- Uses the table generated by CountCovariates to update the base quality scores of the input bam file using a sequential table calculation making the base quality scores more accurately reflect the actual quality of the bases as measured by reference mismatch rate.
*
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal.
* <p>
* This walker is designed to work as the second pass in a two-pass processing step, doing a by-read traversal. For each
* base in each read this walker calculates various user-specified covariates (such as read group, reported quality score,
* cycle, and dinuc). Using these values as a key in a large hashmap the walker calculates an empirical base quality score
* and overwrites the quality score currently in the read. This walker then outputs a new bam file with these updated (recalibrated) reads.
*
* For each base in each read this walker calculates various user-specified covariates (such as read group, reported quality score, cycle, and dinuc)
* Using these values as a key in a large hashmap the walker calculates an empirical base quality score and overwrites the quality score currently in the read.
* This walker then outputs a new bam file with these updated (recalibrated) reads.
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration
*
* Note: This walker expects as input the recalibration table file generated previously by CovariateCounterWalker.
* Note: This walker is designed to be used in conjunction with CovariateCounterWalker.
* <h2>Input</h2>
* <p>
* The input read data whose base quality scores need to be recalibrated.
* <p>
* The recalibration table file in CSV format that was generated by the CountCovariates walker.
* </p>
*
* <h2>Output</h2>
* <p>
* A bam file in which the quality scores in each read have been recalibrated.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -R resources/Homo_sapiens_assembly18.fasta \
* -I my_reads.bam \
* -T TableRecalibration \
* -o my_reads.recal.bam \
* -recalFile my_reads.recal_data.csv
* </pre>
*
* @author rpoplin
* @since Nov 3, 2009
*/
@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT)
@ -79,24 +100,54 @@ public class TableRecalibrationWalker extends ReadWalker<SAMRecord, SAMFileWrite
/////////////////////////////
@ArgumentCollection private RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection();
@Input(fullName="recal_file", shortName="recalFile", required=false, doc="Filename for the input covariates table recalibration .csv file")
public File RECAL_FILE = new File("output.recal_data.csv");
/////////////////////////////
// Command Line Arguments
/////////////////////////////
@Argument(fullName="output_bam", shortName="outputBam", doc="Please use --out instead", required=false)
@Deprecated
protected String outbam;
@Output(doc="The output BAM file", required=true)
/**
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
*/
@Input(fullName="recal_file", shortName="recalFile", required=true, doc="Filename for the input covariates table recalibration .csv file")
public File RECAL_FILE = null;
/**
* A new bam file in which the quality scores in each read have been recalibrated. The alignment of the reads is left untouched.
*/
@Output(doc="The output recalibrated BAM file", required=true)
private StingSAMFileWriter OUTPUT_BAM = null;
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated, default=5. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
/**
 * TableRecalibration accepts a --preserve_qscores_less_than / -pQ <Q> flag that instructs TableRecalibration to not modify
* quality scores less than <Q> but rather just write them out unmodified in the recalibrated BAM file. This is useful
* because Solexa writes Q2 and Q3 bases when the machine has really gone wrong. This would be fine in and of itself,
* but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect,
* your Q2 and Q3 bins can be elevated to Q8 or Q10, leading to issues downstream. With the default value of 5, all Q0-Q4 bases
* are unmodified during recalibration, so they don't get inappropriately evaluated.
*/
@Argument(fullName="preserve_qscores_less_than", shortName="pQ", doc="Bases with quality scores less than this threshold won't be recalibrated. In general it's unsafe to change qualities scores below < 5, since base callers use these values to indicate random or bad bases", required=false)
private int PRESERVE_QSCORES_LESS_THAN = 5;
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points, default=1")
/**
* By default TableRecalibration applies a Yates' correction to account for overfitting when it calculates the empirical
* quality score, in particular, ( # mismatches + 1 ) / ( # observations + 1 ). TableRecalibration accepts a --smoothing / -sm <int>
* argument which sets how many unobserved counts to add to every bin. Use --smoothing 0 to turn off all smoothing or, for example,
* --smoothing 15 for a large amount of smoothing.
*/
@Argument(fullName="smoothing", shortName="sm", required = false, doc="Number of imaginary counts to add to each bin in order to smooth out bins with few data points")
private int SMOOTHING = 1;
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default=50")
/**
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
* by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
*/
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores")
private int MAX_QUALITY_SCORE = 50;
/**
* By default TableRecalibration emits the OQ field -- so you can go back and look at the original quality scores, rerun
* the system using the OQ flags, etc, on the output BAM files; to turn off emission of the OQ field use this flag.
*/
@Argument(fullName="doNotWriteOriginalQuals", shortName="noOQs", required=false, doc="If true, we will not write the original quality (OQ) tag for each read")
private boolean DO_NOT_WRITE_OQ = false;

View File

@ -0,0 +1,463 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.walkers.validation;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.MutableVariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.Map;
import java.util.Set;
import static org.broadinstitute.sting.utils.IndelUtils.isInsideExtendedIndel;
/**
* Genotypes a dataset and validates the calls of another dataset using the Unified Genotyper.
*
* <p>
* Genotype and Validate is a tool to evaluate the quality of a dataset for calling SNPs
* and Indels given a secondary (validation) data source. The data sources are BAM or VCF
* files. You can use them interchangeably (i.e. a BAM to validate calls in a VCF or a VCF
* to validate calls on a BAM).
* </p>
*
* <p>
* The simplest scenario is when you have a VCF of hand annotated SNPs and Indels, and you
* want to know how well a particular technology performs calling these snps. With a
* dataset (BAM file) generated by the technology in test, and the hand annotated VCF, you
 * can run GenotypeAndValidate to assess the accuracy of the calls with the new technology's
* dataset.
* </p>
*
* <p>
* Another option is to validate the calls on a VCF file, using a deep coverage BAM file
* that you trust the calls on. The GenotypeAndValidate walker will make calls using the
* reads in the BAM file and take them as truth, then compare to the calls in the VCF file
* and produce a truth table.
* </p>
*
*
* <h2>Input</h2>
* <p>
* A BAM file to make calls on and a VCF file to use as truth validation dataset.
*
* You also have the option to invert the roles of the files using the command line options listed below.
* </p>
*
* <h2>Output</h2>
* <p>
* GenotypeAndValidate has two outputs. The truth table and the optional VCF file. The truth table is a
* 2x2 table correlating what was called in the dataset with the truth of the call (whether it's a true
* positive or a false positive). The table should look like this:
* </p>
* <center>
* <table id="description-table">
* <tr>
* <th></th>
* <th>ALT</th>
* <th>REF</th>
* <th>Predictive Value</th>
* </tr>
* <tr>
* <td><b>called alt</b></td>
* <td>True Positive (TP)</td>
* <td>False Positive (FP)</td>
* <td>Positive PV</td>
* </tr>
* <tr>
* <td><b>called ref</b></td>
* <td>False Negative (FN)</td>
* <td>True Negative (TN)</td>
* <td>Negative PV</td>
* </tr>
* </table>
* </center>
*
* <p>
* The <b>positive predictive value (PPV)</b> is the proportion of subjects with positive test results
* who are correctly diagnosed.
* </p>
* <p>
* The <b>negative predictive value (NPV)</b> is the proportion of subjects with a negative test result
* who are correctly diagnosed.
* </p>
* <p>
* The VCF file will contain only the variants that were called or not called, excluding the ones that
* were uncovered or didn't pass the filters. This file is useful if you are trying to compare
* the PPV and NPV of two different technologies on the exact same sites (so you can compare apples to
* apples).
* </p>
*
* <p>
* Here is an example of an annotated VCF file (info field clipped for clarity)
*
* <pre>
* #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA12878
* 1 20568807 . C T 0 HapMapHet AC=1;AF=0.50;AN=2;DP=0;GV=T GT 0/1
* 1 22359922 . T C 282 WG-CG-HiSeq AC=2;AF=0.50;GV=T;AN=4;DP=42 GT:AD:DP:GL:GQ 1/0 ./. 0/1:20,22:39:-72.79,-11.75,-67.94:99 ./.
* 13 102391461 . G A 341 Indel;SnpCluster AC=1;GV=F;AF=0.50;AN=2;DP=45 GT:AD:DP:GL:GQ ./. ./. 0/1:32,13:45:-50.99,-13.56,-112.17:99 ./.
* 1 175516757 . C G 655 SnpCluster,WG AC=1;AF=0.50;AN=2;GV=F;DP=74 GT:AD:DP:GL:GQ ./. ./. 0/1:52,22:67:-89.02,-20.20,-191.27:99 ./.
* </pre>
*
* </p>
*
* <h3>Additional Details</h3>
* <ul>
* <li>
* You should always use -BTI on your VCF track, so that the GATK only looks at the sites on the VCF file.
* This speeds up the process a lot.
* </li>
* <li>
* The total number of visited bases may be greater than the number of variants in the original
* VCF file because of extended indels, as they trigger one call per new insertion or deletion.
* (i.e. ACTG/- will count as 4 genotyper calls, but it's only one line in the VCF).
* </li>
* </ul>
*
* <h2>Examples</h2>
* <ol>
* <li>
* Genotypes BAM file from new technology using the VCF as a truth dataset:
* </li>
*
* <pre>
* java
* -jar /GenomeAnalysisTK.jar
* -T GenotypeAndValidate
* -R human_g1k_v37.fasta
* -I myNewTechReads.bam
* -alleles handAnnotatedVCF.vcf
* -BTI alleles
* </pre>
*
* <li>
* Using a BAM file as the truth dataset:
* </li>
*
* <pre>
* java
* -jar /GenomeAnalysisTK.jar
* -T GenotypeAndValidate
* -R human_g1k_v37.fasta
* -I myTruthDataset.bam
* -alleles callsToValidate.vcf
* -BTI alleles
* -bt
* -o gav.vcf
* </pre>
*
*
* @author Mauricio Carneiro
* @since ${DATE}
*/
@Requires(value={DataSource.READS, DataSource.REFERENCE})
@Allows(value={DataSource.READS, DataSource.REFERENCE})
@By(DataSource.REFERENCE)
@Reference(window=@Window(start=-200,stop=200))
public class GenotypeAndValidateWalker extends RodWalker<GenotypeAndValidateWalker.CountedData, GenotypeAndValidateWalker.CountedData> implements TreeReducible<GenotypeAndValidateWalker.CountedData> {
/**
 * The optional output file that will have all the variants used in the Genotype and Validate assay.
*/
@Output(doc="Generate a VCF file with the variants considered by the walker, with a new annotation \"callStatus\" which will carry the value called in the validation VCF or BAM file", required=false)
protected VCFWriter vcfWriter = null;
/**
* The callset to be used as truth (default) or validated (if BAM file is set to truth).
*/
@Input(fullName="alleles", shortName = "alleles", doc="The set of alleles at which to genotype", required=true)
public RodBinding<VariantContext> alleles;
/**
* Makes the Unified Genotyper calls to the BAM file the truth dataset and validates the alleles ROD binding callset.
*/
@Argument(fullName ="set_bam_truth", shortName ="bt", doc="Use the calls on the reads (bam file) as the truth dataset and validate the calls on the VCF", required=false)
private boolean bamIsTruth = false;
/**
* The minimum base quality score necessary for a base to be considered when calling a genotype. This argument is passed to the Unified Genotyper.
*/
@Argument(fullName="minimum_base_quality_score", shortName="mbq", doc="Minimum base quality score for calling a genotype", required=false)
private int mbq = -1;
/**
* The maximum deletion fraction allowed in a site for calling a genotype. This argument is passed to the Unified Genotyper.
*/
@Argument(fullName="maximum_deletion_fraction", shortName="deletions", doc="Maximum deletion fraction for calling a genotype", required=false)
private double deletions = -1;
/**
 * The minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls. This argument is passed to the Unified Genotyper.
*/
@Argument(fullName="standard_min_confidence_threshold_for_calling", shortName="stand_call_conf", doc="the minimum phred-scaled Qscore threshold to separate high confidence from low confidence calls", required=false)
private double callConf = -1;
/**
 * The minimum phred-scaled Qscore threshold to emit low confidence calls. This argument is passed to the Unified Genotyper.
*/
@Argument(fullName="standard_min_confidence_threshold_for_emitting", shortName="stand_emit_conf", doc="the minimum phred-scaled Qscore threshold to emit low confidence calls", required=false)
private double emitConf = -1;
/**
* Only validate sites that have at least a given depth
*/
@Argument(fullName="condition_on_depth", shortName="depth", doc="Condition validation on a minimum depth of coverage by the reads", required=false)
private int minDepth = -1;
/**
* If your VCF or BAM file has more than one sample and you only want to validate one, use this parameter to choose it.
*/
@Hidden
@Argument(fullName ="sample", shortName ="sn", doc="Name of the sample to validate (in case your VCF/BAM has more than one sample)", required=false)
private String sample = "";
private UnifiedGenotyperEngine snpEngine;
private UnifiedGenotyperEngine indelEngine;
/**
 * Mutable accumulator for the walker's truth-table counts. One instance is produced
 * per map() call (with at most one counter set to 1) and instances are merged during
 * the reduce phase via {@link #add}.
 */
public static class CountedData {
// Truth was ALT and the evaluated dataset also called ALT (true positive).
private long nAltCalledAlt = 0L;
// Truth was ALT but the evaluated dataset called REF (false negative).
private long nAltCalledRef = 0L;
// Truth was REF but the evaluated dataset called ALT (false positive).
private long nRefCalledAlt = 0L;
// Truth was REF and the evaluated dataset also called REF (true negative).
private long nRefCalledRef = 0L;
// Sites skipped because the genotyper was not confident either way.
private long nNotConfidentCalls = 0L;
// Sites skipped because coverage was absent or below the optional -depth threshold.
private long nUncovered = 0L;
/**
 * Adds the values of other into this accumulator (in place; this method does not return a value).
 * @param other the other object whose counts are merged into this one
 */
public void add(CountedData other) {
nAltCalledAlt += other.nAltCalledAlt;
nAltCalledRef += other.nAltCalledRef;
nRefCalledAlt += other.nRefCalledAlt;
nRefCalledRef += other.nRefCalledRef;
nUncovered += other.nUncovered;
nNotConfidentCalls += other.nNotConfidentCalls;
}
}
//---------------------------------------------------------------------------------------------------------------
//
// initialize
//
//---------------------------------------------------------------------------------------------------------------
/**
 * Sets up the walker before traversal: writes the optional VCF header and builds the
 * two Unified Genotyper engines (one SNP, one INDEL) that map() will use to call
 * genotypes at each site.
 *
 * NOTE(review): both engines are constructed from the same UnifiedArgumentCollection
 * instance, with GLmodel mutated between the two constructions — this assumes the
 * engine snapshots (or tolerates sharing) the collection; confirm before reordering.
 */
public void initialize() {
// Initialize VCF header
if (vcfWriter != null) {
// Merge the headers of all rods bound under the 'alleles' prefix into one output header.
Map<String, VCFHeader> header = VCFUtils.getVCFHeadersFromRodPrefix(getToolkit(), alleles.getName());
Set<String> samples = SampleUtils.getSampleList(header, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(header.values(), logger);
headerLines.add(new VCFHeaderLine("source", "GenotypeAndValidate"));
vcfWriter.writeHeader(new VCFHeader(headerLines, samples));
}
// Filling in SNP calling arguments for UG
UnifiedArgumentCollection uac = new UnifiedArgumentCollection();
uac.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES;
uac.alleles = alleles;
// When the VCF is truth, genotype only at the given alleles; when the BAM is truth, discover freely.
if (!bamIsTruth) uac.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES;
// Only override engine defaults for parameters the user explicitly supplied (sentinel is -1).
if (mbq >= 0) uac.MIN_BASE_QUALTY_SCORE = mbq;
if (deletions >= 0) uac.MAX_DELETION_FRACTION = deletions;
if (emitConf >= 0) uac.STANDARD_CONFIDENCE_FOR_EMITTING = emitConf;
if (callConf >= 0) uac.STANDARD_CONFIDENCE_FOR_CALLING = callConf;
uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP;
snpEngine = new UnifiedGenotyperEngine(getToolkit(), uac);
// Adding the INDEL calling arguments for UG
uac.GLmodel = GenotypeLikelihoodsCalculationModel.Model.INDEL;
indelEngine = new UnifiedGenotyperEngine(getToolkit(), uac);
// make sure we have callConf set to the threshold set by the UAC so we can use it later.
callConf = uac.STANDARD_CONFIDENCE_FOR_CALLING;
}
//---------------------------------------------------------------------------------------------------------------
//
// map
//
//---------------------------------------------------------------------------------------------------------------
/**
 * Evaluates one site: re-genotypes it with the appropriate UG engine, compares the call
 * against the truth source (the 'alleles' VCF by default, or the BAM when -bt is set),
 * and returns a CountedData with exactly one truth-table counter set.
 *
 * @param tracker reference-ordered data at this locus (may be null; see comment below)
 * @param ref     the reference context at this locus
 * @param context the read pileup at this locus
 * @return a per-site CountedData contribution, all-zero when the site is skipped
 */
public CountedData map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
final CountedData counter = new CountedData();
// For some reason RodWalkers get map calls with null trackers
if( tracker == null )
return counter;
// Only evaluate sites present in the 'alleles' truth/validation VCF.
VariantContext vcComp = tracker.getFirstValue(alleles);
if( vcComp == null )
return counter;
//todo - not sure I want this, may be misleading to filter extended indel events.
if (isInsideExtendedIndel(vcComp, ref))
return counter;
// Do not operate on variants that are not covered to the optional minimum depth
if (!context.hasReads() || (minDepth > 0 && context.getBasePileup().getBases().length < minDepth)) {
counter.nUncovered = 1L;
return counter;
}
// Dispatch to the engine matching the variant type; anything else is logged and skipped.
VariantCallContext call;
if ( vcComp.isSNP() )
call = snpEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
else if ( vcComp.isIndel() ) {
call = indelEngine.calculateLikelihoodsAndGenotypes(tracker, ref, context);
}
else {
logger.info("Not SNP or INDEL " + vcComp.getChr() + ":" + vcComp.getStart() + " " + vcComp.getAlleles());
return counter;
}
boolean writeVariant = true;
if (bamIsTruth) {
// BAM is truth: the fresh UG call is the truth, the VCF (vcComp) is being validated.
if (call.confidentlyCalled) {
// If truth is a confident variant (ALT) call
if (call.isVariant()) {
if (vcComp.isVariant())
counter.nAltCalledAlt = 1L; // todo -- may wanna check if the alts called are the same?
else
counter.nAltCalledRef = 1L;
}
// If truth is a confident REF call
else {
if (vcComp.isVariant())
counter.nRefCalledAlt = 1L;
else
counter.nRefCalledRef = 1L;
}
}
else {
counter.nNotConfidentCalls = 1L;
writeVariant = false;
}
}
else {
// VCF is truth: its GV annotation ("T" = true variant) grades the fresh UG call.
if (!vcComp.hasAttribute("GV"))
throw new UserException.BadInput("Variant has no GV annotation in the INFO field. " + vcComp.getChr() + ":" + vcComp.getStart());
if (call.isCalledAlt(callConf)) {
if (vcComp.getAttribute("GV").equals("T"))
counter.nAltCalledAlt = 1L;
else
counter.nRefCalledAlt = 1L;
}
else if (call.isCalledRef(callConf)) {
if (vcComp.getAttribute("GV").equals("T"))
counter.nAltCalledRef = 1L;
else
counter.nRefCalledRef = 1L;
}
else {
counter.nNotConfidentCalls = 1L;
writeVariant = false;
}
}
// Optionally emit the site, annotating it with how this run called it.
if (vcfWriter != null && writeVariant) {
if (!vcComp.hasAttribute("callStatus")) {
MutableVariantContext mvc = new MutableVariantContext(vcComp);
mvc.putAttribute("callStatus", call.isCalledAlt(callConf) ? "ALT" : "REF" );
vcfWriter.add(mvc);
}
else
vcfWriter.add(vcComp);
}
return counter;
}
//---------------------------------------------------------------------------------------------------------------
//
// reduce
//
//---------------------------------------------------------------------------------------------------------------

/**
 * Provides the starting point for the reduce phase.
 *
 * @return a freshly allocated, all-zero CountedData accumulator
 */
public CountedData reduceInit() {
    final CountedData emptyCounts = new CountedData();
    return emptyCounts;
}
/**
 * Merges two partial counter sets produced on different shards of the traversal.
 * Folds {@code sum1} into {@code sum2} and hands {@code sum2} back as the combined result.
 *
 * @param sum1 one partial accumulation
 * @param sum2 the other partial accumulation; mutated in place to hold the combined totals
 * @return {@code sum2}, now containing the totals of both inputs
 */
public CountedData treeReduce( final CountedData sum1, final CountedData sum2) {
    final CountedData combined = sum2;
    combined.add(sum1);
    return combined;
}
/**
 * Accumulates one per-site map result into the running totals.
 *
 * @param mapValue  the one-hot counter produced by {@link #map} for a single site
 * @param reduceSum the running accumulator; mutated in place
 * @return {@code reduceSum}, updated with {@code mapValue}'s counts
 */
public CountedData reduce( final CountedData mapValue, final CountedData reduceSum ) {
    final CountedData runningTotal = reduceSum;
    runningTotal.add(mapValue);
    return runningTotal;
}
/**
 * Called once the traversal is complete: derives summary statistics from the final counters
 * and logs a 2x2 truth table (truth state vs. call state) together with positive/negative
 * predictive value, sensitivity and specificity.
 *
 * All four statistics now share the same zero-denominator guard that the original code
 * applied only to specificity; an empty category reports 100% instead of printing NaN.
 *
 * @param reduceSum the accumulated counters over all evaluated sites
 */
public void onTraversalDone( CountedData reduceSum ) {
    double ppv = percentOr100(reduceSum.nAltCalledAlt, reduceSum.nAltCalledAlt + reduceSum.nRefCalledAlt);         // TP / (TP + FP)
    double npv = percentOr100(reduceSum.nRefCalledRef, reduceSum.nRefCalledRef + reduceSum.nAltCalledRef);         // TN / (TN + FN)
    double sensitivity = percentOr100(reduceSum.nAltCalledAlt, reduceSum.nAltCalledAlt + reduceSum.nAltCalledRef); // TP / (TP + FN)
    double specificity = percentOr100(reduceSum.nRefCalledRef, reduceSum.nRefCalledRef + reduceSum.nRefCalledAlt); // TN / (TN + FP)
    logger.info(String.format("Resulting Truth Table Output\n\n" +
            "---------------------------------------------------\n" +
            "\t\t|\tALT\t|\tREF\t\n" +
            "---------------------------------------------------\n" +
            "called alt\t|\t%d\t|\t%d\n" +
            "called ref\t|\t%d\t|\t%d\n" +
            "---------------------------------------------------\n" +
            "positive predictive value: %f%%\n" +
            "negative predictive value: %f%%\n" +
            "---------------------------------------------------\n" +
            "sensitivity: %f%%\n" +
            "specificity: %f%%\n" +
            "---------------------------------------------------\n" +
            "not confident: %d\n" +
            "not covered: %d\n" +
            "---------------------------------------------------\n", reduceSum.nAltCalledAlt, reduceSum.nRefCalledAlt, reduceSum.nAltCalledRef, reduceSum.nRefCalledRef, ppv, npv, sensitivity, specificity, reduceSum.nNotConfidentCalls, reduceSum.nUncovered));
}

/**
 * Computes {@code 100 * numerator / denominator}, returning 100 when the denominator is zero
 * so that empty categories do not produce NaN in the report (matches the convention the
 * specificity calculation already used).
 */
private static double percentOr100(final long numerator, final long denominator) {
    return denominator > 0 ? 100.0 * (double) numerator / denominator : 100.0;
}
}

View File

@ -14,9 +14,8 @@ import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.table.TableFeature;
import org.broadinstitute.sting.utils.codecs.table.TableFeature;
import org.broadinstitute.sting.gatk.walkers.DataSource;
import org.broadinstitute.sting.gatk.walkers.RMD;
import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.utils.BaseUtils;
@ -31,21 +30,77 @@ import java.util.LinkedList;
import java.util.List;
/**
* Created by IntelliJ IDEA.
* User: chartl
* Date: 6/13/11
* Time: 2:12 PM
* To change this template use File | Settings | File Templates.
 * Creates FASTA sequences for use in Sequenom or PCR utilities for site amplification and subsequent validation
*
* <p>
* ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe
* sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within
* the output amplicon, lower-casing non-specific tracts, allows for users to provide sites to mask out, and specifies
* reasons why the site may fail validation (nearby variation, for example).
* </p>
*
* <h2>Input</h2>
* <p>
* Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an
* interval list defining the size of the amplicons around the sites to be validated
* </p>
*
* <h2>Output</h2>
* <p>
* Output is a FASTA-formatted file with some modifications at probe sites. For instance:
* <pre>
* >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414
* CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC
* >20:792122 Valid 20_792122
* TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT
* >20:994145 Valid 20_994145
* TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC
* >20:1074230 SITE_IS_FILTERED=1, 20_1074230
* ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC
* >20:1084330 DELETION=1, 20_1084330
* CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC
*</pre>
* are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be:
*
* Valid // amplicon is valid
* SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant")
* VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak
* MULTIPLE_PROBES=1, // multiple variants to be validated found inside the same amplicon
* DELETION=6,INSERTION=5, // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate
* DELETION=1, // deletion found inside the amplicon region, could shift mass-spec peak
* START_TOO_CLOSE, // variant is too close to the start of the amplicon region to give sequenom a good chance to find a suitable primer
* END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer
* NO_VARIANTS_FOUND, // no variants found within the amplicon region
 * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. an insertion immediately preceding or following, or a deletion that spans the site itself)
* </p>
*
* <h2>Examples</h2>
* <pre></pre>
* java
* -jar GenomeAnalysisTK.jar
* -T ValidationAmplicons
* -R /humgen/1kg/reference/human_g1k_v37.fasta
* -BTI ProbeIntervals
* -ProbeIntervals:table interval_table.table
* -ValidateAlleles:vcf sites_to_validate.vcf
* -MaskAlleles:vcf mask_sites.vcf
* --virtualPrimerSize 30
* -o probes.fasta
* </pre>
*
* @author chartl
* @since July 2011
*/
@Requires(value={DataSource.REFERENCE})
public class ValidationAmplicons extends RodWalker<Integer,Integer> {
@Input(fullName = "ProbeIntervals", doc="Chris document me", required=true)
@Input(fullName = "ProbeIntervals", doc="A collection of intervals in table format with optional names that represent the "+
"intervals surrounding the probe sites amplicons should be designed for", required=true)
RodBinding<TableFeature> probeIntervals;
@Input(fullName = "ValidateAlleles", doc="Chris document me", required=true)
@Input(fullName = "ValidateAlleles", doc="A VCF containing the sites and alleles you want to validate. Restricted to *BI-Allelic* sites", required=true)
RodBinding<VariantContext> validateAlleles;
@Input(fullName = "MaskAlleles", doc="Chris document me", required=true)
@Input(fullName = "MaskAlleles", doc="A VCF containing the sites you want to MASK from the designed amplicon (e.g. by Ns or lower-cased bases)", required=true)
RodBinding<VariantContext> maskAlleles;
@ -195,17 +250,17 @@ public class ValidationAmplicons extends RodWalker<Integer,Integer> {
} else /* (mask != null && validate == null ) */ {
if ( ! mask.isSNP() && ! mask.isFiltered() && ( ! filterMonomorphic || ! mask.isMonomorphic() )) {
logger.warn("Mask Variant Context on the following warning line is not a SNP. Currently we can only mask out SNPs. This probe will not be designed.");
logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
logger.warn(String.format("%s:%d-%d\t%s\t%s",mask.getChr(),mask.getStart(),mask.getEnd(),mask.isSimpleInsertion() ? "INS" : "DEL", Utils.join(",",mask.getAlleles())));
sequenceInvalid = true;
invReason.add(mask.isInsertion() ? "INSERTION" : "DELETION");
invReason.add(mask.isSimpleInsertion() ? "INSERTION" : "DELETION");
// note: indelCounter could be > 0 (could have small deletion within larger one). This always selects
// the larger event.
int indelCounterNew = mask.isInsertion() ? 2 : mask.getEnd()-mask.getStart();
int indelCounterNew = mask.isSimpleInsertion() ? 2 : mask.getEnd()-mask.getStart();
if ( indelCounterNew > indelCounter ) {
indelCounter = indelCounterNew;
}
//sequence.append((char) ref.getBase());
//sequence.append(mask.isInsertion() ? 'I' : 'D');
//sequence.append(mask.isSimpleInsertion() ? 'I' : 'D');
sequence.append("N");
indelCounter--;
rawSequence.append(Character.toUpperCase((char) ref.getBase()));

View File

@ -36,25 +36,66 @@ import java.util.*;
/**
* General-purpose tool for variant evaluation (% in dbSNP, genotype concordance, Ti/Tv ratios, and a lot more)
*
* <p>
* Given a variant callset, it is common to calculate various quality control metrics. These metrics include the number of
* raw or filtered SNP counts; ratio of transition mutations to transversions; concordance of a particular sample's calls
* to a genotyping chip; number of singletons per sample; etc. Furthermore, it is often useful to stratify these metrics
* by various criteria like functional class (missense, nonsense, silent), whether the site is CpG site, the amino acid
* degeneracy of the site, etc. VariantEval facilitates these calculations in two ways: by providing several built-in
* evaluation and stratification modules, and by providing a framework that permits the easy development of new evaluation
* and stratification modules.
*
* <h2>Input</h2>
* <p>
* One or more variant sets to evaluate plus any number of comparison sets.
* </p>
*
* <h2>Output</h2>
* <p>
* Evaluation tables.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantEval \
* -o output.eval.gatkreport \
* --eval:set1 set1.vcf \
* --eval:set2 set2.vcf \
* [--comp comp.vcf]
* </pre>
*
*/
@Reference(window=@Window(start=-50, stop=50))
public class VariantEvalWalker extends RodWalker<Integer, Integer> implements TreeReducible<Integer> {
// Output arguments
@Output
protected PrintStream out;
/**
* The variant file(s) to evaluate.
*/
@Input(fullName="eval", shortName = "eval", doc="Input evaluation file(s)", required=true)
public List<RodBinding<VariantContext>> evals;
/**
* The variant file(s) to compare against.
*/
@Input(fullName="comp", shortName = "comp", doc="Input comparison file(s)", required=false)
public List<RodBinding<VariantContext>> compsProvided = Collections.emptyList();
private List<RodBinding<VariantContext>> comps = new ArrayList<RodBinding<VariantContext>>();
/**
* dbSNP comparison VCF. By default, the dbSNP file is used to specify the set of "known" variants.
* Other sets can be specified with the -knownName (--known_names) argument.
*/
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
// Help arguments
@Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit")
@Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false)
protected Boolean LIST = false;
// Partitioning the data arguments
@ -67,8 +108,12 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="sample", shortName="sn", doc="Derive eval and comp contexts using only these sample genotypes, when genotypes are available in the original context", required=false)
protected Set<String> SAMPLE_EXPRESSIONS;
/**
* List of rod tracks to be used for specifying "known" variants other than dbSNP.
*/
@Argument(shortName="knownName", doc="Name of ROD bindings containing variant sites that should be treated as known when splitting eval rods into known and novel subsets", required=false)
protected String[] KNOWN_NAMES = {};
protected HashSet<String> KNOWN_NAMES = new HashSet<String>();
List<RodBinding<VariantContext>> knowns = new ArrayList<RodBinding<VariantContext>>();
// Stratification arguments
@Argument(fullName="stratificationModule", shortName="ST", doc="One or more specific stratification modules to apply to the eval track(s) (in addition to the standard stratifications, unless -noS is specified)", required=false)
@ -80,7 +125,9 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="onlyVariantsOfType", shortName="VT", doc="If provided, only variants of these types will be considered during the evaluation, in ", required=false)
protected Set<VariantContext.Type> typesToUse = null;
// Evaluator arguments
/**
* See the -list argument to view available modules.
*/
@Argument(fullName="evalModule", shortName="EV", doc="One or more specific eval modules to apply to the eval track(s) (in addition to the standard modules, unless -noE is specified)", required=false)
protected String[] MODULES_TO_USE = {};
@ -94,7 +141,10 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
@Argument(fullName="minPhaseQuality", shortName="mpq", doc="Minimum phasing quality", required=false)
protected double MIN_PHASE_QUALITY = 10.0;
@Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations: this argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
/**
* This argument is a string formatted as dad+mom=child where these parameters determine which sample names are examined.
*/
@Argument(shortName="family", doc="If provided, genotypes in will be examined for mendelian violations", required=false)
protected String FAMILY_STRUCTURE;
@Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
@ -108,9 +158,6 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// Variables
private Set<SortableJexlVCMatchExp> jexlExpressions = new TreeSet<SortableJexlVCMatchExp>();
private Set<String> compNames = new TreeSet<String>();
private Set<String> knownNames = new TreeSet<String>();
private Set<String> evalNames = new TreeSet<String>();
private Set<String> sampleNamesForEvaluation = new TreeSet<String>();
private Set<String> sampleNamesForStratification = new TreeSet<String>();
@ -149,23 +196,24 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
comps.addAll(compsProvided);
if ( dbsnp.dbsnp.isBound() ) {
comps.add(dbsnp.dbsnp);
knownNames.add(dbsnp.dbsnp.getName());
knowns.add(dbsnp.dbsnp);
}
// Add a dummy comp track if none exists
if ( comps.size() == 0 )
comps.add(new RodBinding<VariantContext>(VariantContext.class, "none", "UNBOUND", "", new Tags()));
// Cache the rod names
for ( RodBinding<VariantContext> compRod : comps )
compNames.add(compRod.getName());
// Set up set of additional knowns
for ( RodBinding<VariantContext> compRod : comps ) {
if ( KNOWN_NAMES.contains(compRod.getName()) )
knowns.add(compRod);
}
// Collect the eval rod names
Set<String> evalNames = new TreeSet<String>();
for ( RodBinding<VariantContext> evalRod : evals )
evalNames.add(evalRod.getName());
// Set up set of additional known names
knownNames.addAll(Arrays.asList(KNOWN_NAMES));
// Now that we have all the rods categorized, determine the sample list from the eval rods.
Map<String, VCFHeader> vcfRods = VCFUtils.getVCFHeadersFromRods(getToolkit(), evalNames);
Set<String> vcfSamples = SampleUtils.getSampleList(vcfRods, VariantContextUtils.GenotypeMergeType.REQUIRE_UNIQUE);
@ -263,7 +311,8 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
// for each comp track
for ( final RodBinding<VariantContext> compRod : comps ) {
// no sample stratification for comps
final Set<VariantContext> compSet = compVCs.get(compRod) == null ? new HashSet<VariantContext>(0) : compVCs.get(compRod).values().iterator().next();
final HashMap<String, Set<VariantContext>> compSetHash = compVCs.get(compRod);
final Set<VariantContext> compSet = (compSetHash == null || compSetHash.size() == 0) ? new HashSet<VariantContext>(0) : compVCs.get(compRod).values().iterator().next();
// find the comp
final VariantContext comp = findMatchingComp(eval, compSet);
@ -462,15 +511,15 @@ public class VariantEvalWalker extends RodWalker<Integer, Integer> implements Tr
public static String getAllSampleName() { return ALL_SAMPLE_NAME; }
public Set<String> getKnownNames() { return knownNames; }
public List<RodBinding<VariantContext>> getKnowns() { return knowns; }
public Set<String> getEvalNames() { return evalNames; }
public List<RodBinding<VariantContext>> getEvals() { return evals; }
public Set<String> getSampleNamesForEvaluation() { return sampleNamesForEvaluation; }
public Set<String> getSampleNamesForStratification() { return sampleNamesForStratification; }
public Set<String> getCompNames() { return compNames; }
public List<RodBinding<VariantContext>> getComps() { return comps; }
public Set<SortableJexlVCMatchExp> getJexlExpressions() { return jexlExpressions; }

View File

@ -39,8 +39,10 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
public long nInsertions = 0;
@DataPoint(description = "Number of deletions")
public long nDeletions = 0;
@DataPoint(description = "Number of complex loci")
@DataPoint(description = "Number of complex indels")
public long nComplex = 0;
@DataPoint(description = "Number of mixed loci (loci that can't be classified as a SNP, Indel or MNP)")
public long nMixed = 0;
@DataPoint(description = "Number of no calls loci")
@ -97,27 +99,35 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
// This is really not correct. What we really want here is a polymorphic vs. monomorphic count (i.e. on the Genotypes).
// So in order to maintain consistency with the previous implementation (and the intention of the original author), I've
// added in a proxy check for monomorphic status here.
if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() == vc1.getNSamples()) ) {
// Protect against the case when vc has only no-calls too - can happen if we stratify by sample and the sample has a single no-call.
if ( !vc1.isVariant() || (vc1.hasGenotypes() && vc1.getHomRefCount() + vc1.getNoCallCount() == vc1.getNSamples()) ) {
nRefLoci++;
} else {
nVariantLoci++;
switch (vc1.getType()) {
switch (vc1.getType()) {
case NO_VARIATION:
break;
case SNP:
nVariantLoci++;
nSNPs++;
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
break;
case MNP:
nVariantLoci++;
nMNPs++;
if (vc1.getAttributeAsBoolean("ISSINGLETON")) nSingletons++;
break;
case INDEL:
if (vc1.isInsertion()) nInsertions++;
else nDeletions++;
nVariantLoci++;
if (vc1.isSimpleInsertion())
nInsertions++;
else if (vc1.isSimpleDeletion())
nDeletions++;
else
nComplex++;
break;
case MIXED:
nComplex++;
nVariantLoci++;
nMixed++;
break;
default:
throw new ReviewedStingException("Unexpected VariantContext type " + vc1.getType());
@ -180,8 +190,8 @@ public class CountVariants extends VariantEvaluator implements StandardEval {
heterozygosity = perLocusRate(nHets);
heterozygosityPerBp = perLocusRInverseRate(nHets);
hetHomRatio = ratio(nHets, nHomVar);
indelRate = perLocusRate(nDeletions + nInsertions);
indelRatePerBp = perLocusRInverseRate(nDeletions + nInsertions);
indelRate = perLocusRate(nDeletions + nInsertions + nComplex);
indelRatePerBp = perLocusRInverseRate(nDeletions + nInsertions + nComplex);
deletionInsertionRatio = ratio(nDeletions, nInsertions);
}
}

View File

@ -96,9 +96,9 @@ public class IndelLengthHistogram extends VariantEvaluator {
}
if ( vc1.isIndel() ) {
if ( vc1.isInsertion() ) {
if ( vc1.isSimpleInsertion() ) {
indelHistogram.update(vc1.getAlternateAllele(0).length());
} else if ( vc1.isDeletion() ) {
} else if ( vc1.isSimpleDeletion() ) {
indelHistogram.update(-vc1.getReference().length());
} else {
throw new ReviewedStingException("Indel type that is not insertion or deletion.");

View File

@ -1,221 +0,0 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.TableType;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/**
 * VariantEval module that tabulates indel metrics (counts and a length histogram) stratified
 * by the allele count (AC) of each biallelic indel in the eval track.
 *
 * @author delangel
 * @since Apr 11, 2010
 */
@Analysis(name = "Indel Metrics by allele count", description = "Shows various stats binned by allele count")
public class IndelMetricsByAC extends VariantEvaluator {
    // The per-AC indel metrics table emitted by this module; lazily created in update1()
    // once the sample count is known (one row per possible AC, 0..2*nSamples).
    @DataPoint(description = "Indel Metrics by allele count")
    IndelMetricsByAc metrics = null;

    int numSamples = 0;

    // Captures the sample count from the walker; used later to size the per-AC table.
    public void initialize(VariantEvalWalker walker) {
        numSamples = walker.getNumSamples();
    }

    //@DataPoint(name="Quality by Allele Count", description = "average variant quality for each allele count")
    //AlleleCountStats alleleCountStats = null;

    // Histogram covers indel event lengths in (-INDEL_SIZE_LIMIT, INDEL_SIZE_LIMIT);
    // deletions are negative lengths, insertions positive.
    private static final int INDEL_SIZE_LIMIT = 100;
    // Number of non-histogram (scalar) columns at the start of METRIC_COLUMNS.
    private static final int NUM_SCALAR_COLUMNS = 6;

    // Maps a signed event length to its histogram slot in sizeCount (0..2*INDEL_SIZE_LIMIT).
    static int len2Index(int ind) {
        return ind+INDEL_SIZE_LIMIT;
    }

    // Inverse mapping used for column labels: takes a METRIC_COLUMNS index (which includes the
    // NUM_SCALAR_COLUMNS offset) back to the signed event length it represents.
    static int index2len(int ind) {
        return ind-INDEL_SIZE_LIMIT-NUM_SCALAR_COLUMNS;
    }

    // Column layout: 6 scalar columns followed by one column per histogram length bin.
    protected final static String[] METRIC_COLUMNS;
    static {
        METRIC_COLUMNS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1];
        METRIC_COLUMNS[0] = "AC";
        METRIC_COLUMNS[1] = "nIns";
        METRIC_COLUMNS[2] = "nDels";
        METRIC_COLUMNS[3] = "n";
        METRIC_COLUMNS[4] = "nComplex";
        METRIC_COLUMNS[5] = "nLong";
        for (int k=NUM_SCALAR_COLUMNS; k < NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT+1; k++)
            METRIC_COLUMNS[k] = "indel_size_len"+Integer.valueOf(index2len(k));
    }

    /**
     * Accumulates the metrics for a single allele count value: insertion/deletion/complex
     * counts plus a histogram of event lengths.
     */
    class IndelMetricsAtAC {
        public int ac = -1, nIns =0, nDel = 0, nComplex = 0, nLong;
        // Length histogram; slot len2Index(len) counts events of signed length len.
        public int sizeCount[] = new int[2*INDEL_SIZE_LIMIT+1];

        public IndelMetricsAtAC(int ac) { this.ac = ac; }

        /**
         * Classifies one indel event and updates the counters. Insertion length is the alt
         * allele length, deletion length is minus the reference allele length; anything else
         * is counted as complex (with eventLength left at 0, i.e. histogrammed in the 0 bin).
         */
        public void update(VariantContext eval) {
            int eventLength = 0;
            if ( eval.isInsertion() ) {
                eventLength = eval.getAlternateAllele(0).length();
                nIns++;
            } else if ( eval.isDeletion() ) {
                eventLength = -eval.getReference().length();
                nDel++;
            }
            else {
                nComplex++;
            }
            // NOTE(review): the strict '<' sends events of length exactly INDEL_SIZE_LIMIT to
            // nLong even though the histogram has a bin for them — confirm whether intended.
            if (Math.abs(eventLength) < INDEL_SIZE_LIMIT)
                sizeCount[len2Index(eventLength)]++;
            else
                nLong++;
        }

        // corresponding to METRIC_COLUMNS
        public String getColumn(int i) {
            // Histogram columns: column i holds sizeCount[i - NUM_SCALAR_COLUMNS], i.e. the
            // count for event length index2len(i).
            if (i >= NUM_SCALAR_COLUMNS && i <=NUM_SCALAR_COLUMNS+ 2*INDEL_SIZE_LIMIT)
                return String.valueOf(sizeCount[i-NUM_SCALAR_COLUMNS]);
            switch (i) {
                case 0: return String.valueOf(ac);
                case 1: return String.valueOf(nIns);
                case 2: return String.valueOf(nDel);
                case 3: return String.valueOf(nIns + nDel);
                case 4: return String.valueOf(nComplex);
                case 5: return String.valueOf(nLong);
                default:
                    throw new ReviewedStingException("Unexpected column " + i);
            }
        }
    }

    /**
     * The full per-AC table: one IndelMetricsAtAC row for each allele count 0..nchromosomes,
     * exposed through the VariantEval TableType interface.
     */
    class IndelMetricsByAc implements TableType {
        ArrayList<IndelMetricsAtAC> metrics = new ArrayList<IndelMetricsAtAC>();
        Object[] rows = null;

        public IndelMetricsByAc( int nchromosomes ) {
            rows = new Object[nchromosomes+1];
            metrics = new ArrayList<IndelMetricsAtAC>(nchromosomes+1);
            for ( int i = 0; i < nchromosomes + 1; i++ ) {
                metrics.add(new IndelMetricsAtAC(i));
                rows[i] = "ac" + i;
            }
        }

        public Object[] getRowKeys() {
            return rows;
        }

        public Object[] getColumnKeys() {
            return METRIC_COLUMNS;
        }

        public String getName() {
            return "IndelMetricsByAc";
        }

        // Row index is the allele count itself (rows were created in AC order).
        public String getCell(int ac, int y) {
            return metrics.get(ac).getColumn(y);
        }

        public String toString() {
            return "";
        }

        /**
         * Routes one eval record to the row for its allele count. AC is taken from the
         * genotypes when present, otherwise from the "AC" INFO attribute; records with
         * no determinable AC are silently skipped.
         */
        public void incrValue( VariantContext eval ) {
            int ac = -1;
            if ( eval.hasGenotypes() )
                ac = eval.getChromosomeCount(eval.getAlternateAllele(0));
            else if ( eval.hasAttribute("AC") ) {
                ac = Integer.valueOf(eval.getAttributeAsString("AC"));
            }
            if ( ac != -1 )
                metrics.get(ac).update(eval);
        }
    }

    //public IndelMetricsByAC(VariantEvalWalker parent) {
        //super(parent);
        // don't do anything
    //}

    public String getName() {
        return "IndelMetricsByAC";
    }

    public int getComparisonOrder() {
        return 1; // we only need to see each eval track
    }

    public boolean enabled() {
        return true;
    }

    public String toString() {
        return getName();
    }

    /**
     * Per-site update: lazily creates the per-AC table (sized 2 * numSamples, the maximum
     * possible chromosome count) and tallies biallelic indels only.
     */
    public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
        final String interesting = null;
        if (eval != null ) {
            if ( metrics == null ) {
                int nSamples = numSamples;
                //int nSamples = 2;
                if ( nSamples != -1 )
                    metrics = new IndelMetricsByAc(2 * nSamples);
            }
            if ( eval.isIndel() && eval.isBiallelic() &&
                    metrics != null ) {
                metrics.incrValue(eval);
            }
        }
        return interesting; // This module doesn't capture any interesting sites, so return null
    }

    //public void finalizeEvaluation() {
    //
    //}
}

View File

@ -44,7 +44,7 @@ public class IndelStatistics extends VariantEvaluator {
@DataPoint(description = "Indel Statistics")
IndelStats indelStats = null;
@DataPoint(description = "Indel Classification")
// @DataPoint(description = "Indel Classification")
IndelClasses indelClasses = null;
int numSamples = 0;
@ -57,13 +57,13 @@ public class IndelStatistics extends VariantEvaluator {
private static final int IND_HET = 0;
private static final int IND_INS = 1;
private static final int IND_DEL = 2;
private static final int IND_AT_CG_RATIO = 3;
private static final int IND_COMPLEX = 3;
private static final int IND_HET_INS = 4;
private static final int IND_HOM_INS = 5;
private static final int IND_HET_DEL = 6;
private static final int IND_HOM_DEL = 7;
private static final int IND_HOM_REF = 8;
private static final int IND_COMPLEX = 9;
private static final int IND_MIXED = 9;
private static final int IND_LONG = 10;
private static final int IND_AT_EXP = 11;
private static final int IND_CG_EXP = 12;
@ -79,15 +79,14 @@ public class IndelStatistics extends VariantEvaluator {
}
static class IndelStats implements TableType {
protected final static String ALL_SAMPLES_KEY = "allSamples";
protected final static String[] COLUMN_KEYS;
protected final static String[] COLUMN_KEYS;
static {
COLUMN_KEYS= new String[NUM_SCALAR_COLUMNS+2*INDEL_SIZE_LIMIT+1];
COLUMN_KEYS[0] = "heterozygosity";
COLUMN_KEYS[1] = "insertions";
COLUMN_KEYS[2] = "deletions";
COLUMN_KEYS[3] = "AT_CG_expansion_ratio";
COLUMN_KEYS[3] = "complex";
COLUMN_KEYS[4] = "het_insertions";
COLUMN_KEYS[5] = "homozygous_insertions";
COLUMN_KEYS[6] = "het_deletions";
@ -104,13 +103,10 @@ public class IndelStatistics extends VariantEvaluator {
}
// map of sample to statistics
protected final HashMap<String, int[]> indelSummary = new HashMap<String, int[]>();
protected final int[] indelSummary;
public IndelStats(final VariantContext vc) {
indelSummary.put(ALL_SAMPLES_KEY, new int[COLUMN_KEYS.length]);
for( final String sample : vc.getGenotypes().keySet() ) {
indelSummary.put(sample, new int[COLUMN_KEYS.length]);
}
indelSummary = new int[COLUMN_KEYS.length];
}
/**
@ -118,19 +114,10 @@ public class IndelStatistics extends VariantEvaluator {
* @return one row per sample
*/
public Object[] getRowKeys() {
return indelSummary.keySet().toArray(new String[indelSummary.size()]);
return new String[]{"all"};
}
public Object getCell(int x, int y) {
final Object[] rowKeys = getRowKeys();
if (y == IND_AT_CG_RATIO) {
int at = indelSummary.get(rowKeys[x])[IND_AT_EXP];
int cg = indelSummary.get(rowKeys[x])[IND_CG_EXP];
return String.format("%4.2f",((double)at) / (Math.max(cg, 1)));
}
else
return String.format("%d",indelSummary.get(rowKeys[x])[y]);
return String.format("%d",indelSummary[y]);
}
/**
@ -160,96 +147,49 @@ public class IndelStatistics extends VariantEvaluator {
int eventLength = 0;
boolean isInsertion = false, isDeletion = false;
if ( vc.isInsertion() ) {
if ( vc.isSimpleInsertion() ) {
eventLength = vc.getAlternateAllele(0).length();
indelSummary.get(ALL_SAMPLES_KEY)[IND_INS]++;
indelSummary[IND_INS]++;
isInsertion = true;
} else if ( vc.isDeletion() ) {
indelSummary.get(ALL_SAMPLES_KEY)[IND_DEL]++;
} else if ( vc.isSimpleDeletion() ) {
indelSummary[IND_DEL]++;
eventLength = -vc.getReference().length();
isDeletion = true;
}
else {
indelSummary.get(ALL_SAMPLES_KEY)[IND_COMPLEX]++;
else if (vc.isComplexIndel()) {
indelSummary[IND_COMPLEX]++;
}
else if (vc.isMixed())
indelSummary[IND_MIXED]++;
if (IndelUtils.isATExpansion(vc,ref))
indelSummary.get(ALL_SAMPLES_KEY)[IND_AT_EXP]++;
indelSummary[IND_AT_EXP]++;
if (IndelUtils.isCGExpansion(vc,ref))
indelSummary.get(ALL_SAMPLES_KEY)[IND_CG_EXP]++;
indelSummary[IND_CG_EXP]++;
// make sure event doesn't overstep array boundaries
if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) {
indelSummary.get(ALL_SAMPLES_KEY)[len2Index(eventLength)]++;
if (eventLength % 3 != 0)
indelSummary.get(ALL_SAMPLES_KEY)[IND_FRAMESHIFT]++;
}
else
indelSummary.get(ALL_SAMPLES_KEY)[IND_LONG]++;
for( final String sample : vc.getGenotypes().keySet() ) {
if ( indelSummary.containsKey(sample) ) {
Genotype g = vc.getGenotype(sample);
boolean isVariant = (g.isCalled() && !g.isHomRef());
if (isVariant) {
// update ins/del count
if (isInsertion) {
indelSummary.get(sample)[IND_INS]++;
}
else if (isDeletion)
indelSummary.get(sample)[IND_DEL]++;
else
indelSummary.get(sample)[IND_COMPLEX]++;
// update histogram
if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) {
indelSummary.get(sample)[len2Index(eventLength)]++;
if (eventLength % 3 != 0)
indelSummary.get(sample)[IND_FRAMESHIFT]++;
}
else
indelSummary.get(sample)[IND_LONG]++;
if (g.isHet())
if (isInsertion)
indelSummary.get(sample)[IND_HET_INS]++;
else if (isDeletion)
indelSummary.get(sample)[IND_HET_DEL]++;
else
if (isInsertion)
indelSummary.get(sample)[IND_HOM_INS]++;
else if (isDeletion)
indelSummary.get(sample)[IND_HOM_DEL]++;
if (IndelUtils.isATExpansion(vc,ref))
indelSummary.get(sample)[IND_AT_EXP]++;
if (IndelUtils.isCGExpansion(vc,ref))
indelSummary.get(sample)[IND_CG_EXP]++;
}
else
indelSummary.get(sample)[IND_HOM_REF]++;
if (vc.isSimpleDeletion() || vc.isSimpleInsertion()) {
if (Math.abs(eventLength) < INDEL_SIZE_LIMIT) {
indelSummary[len2Index(eventLength)]++;
if (eventLength % 3 != 0)
indelSummary[IND_FRAMESHIFT]++;
}
else
indelSummary[IND_LONG]++;
}
}
}
static class IndelClasses implements TableType {
protected final static String ALL_SAMPLES_KEY = "allSamples";
protected final static String[] columnNames = IndelUtils.getIndelClassificationNames();
// map of sample to statistics
protected final HashMap<String, int[]> indelClassSummary = new HashMap<String, int[]>();
protected final int[] indelClassSummary;
public IndelClasses(final VariantContext vc) {
indelClassSummary.put(ALL_SAMPLES_KEY, new int[columnNames.length]);
for( final String sample : vc.getGenotypes().keySet() ) {
indelClassSummary.put(sample, new int[columnNames.length]);
}
indelClassSummary = new int[columnNames.length];
}
/**
@ -257,11 +197,10 @@ public class IndelStatistics extends VariantEvaluator {
* @return one row per sample
*/
public Object[] getRowKeys() {
return indelClassSummary.keySet().toArray(new String[indelClassSummary.size()]);
return new String[]{"all"};
}
public Object getCell(int x, int y) {
final Object[] rowKeys = getRowKeys();
return String.format("%d",indelClassSummary.get(rowKeys[x])[y]);
return String.format("%d",indelClassSummary[y]);
}
/**
@ -285,18 +224,7 @@ public class IndelStatistics extends VariantEvaluator {
}
private void incrementSampleStat(VariantContext vc, int index) {
indelClassSummary.get(ALL_SAMPLES_KEY)[index]++;
for( final String sample : vc.getGenotypes().keySet() ) {
if ( indelClassSummary.containsKey(sample) ) {
Genotype g = vc.getGenotype(sample);
boolean isVariant = (g.isCalled() && !g.isHomRef());
if (isVariant)
// update count
indelClassSummary.get(sample)[index]++;
}
}
indelClassSummary[index]++;
}
/*
* increment the specified value
@ -344,16 +272,13 @@ public class IndelStatistics extends VariantEvaluator {
if (eval != null ) {
if ( indelStats == null ) {
int nSamples = numSamples;
if ( nSamples != -1 )
indelStats = new IndelStats(eval);
indelStats = new IndelStats(eval);
}
if ( indelClasses == null ) {
indelClasses = new IndelClasses(eval);
}
if ( eval.isIndel() && eval.isBiallelic() ) {
if ( eval.isIndel() || eval.isMixed() ) {
if (indelStats != null )
indelStats.incrValue(eval, ref);

View File

@ -1,23 +1,25 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Set;
import java.util.List;
public class AlleleCount extends VariantStratifier {
// needs to know the variant context
private ArrayList<String> states = new ArrayList<String>();
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
public void initialize() {
List<RodBinding<VariantContext>> evals = getVariantEvalWalker().getEvals();
// we can only work with a single eval VCF, and it must have genotypes
if ( evalNames.size() != 1 )
if ( evals.size() != 1 )
throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification only works with a single eval vcf");
// There are 2 x n sample chromosomes for diploids

View File

@ -2,19 +2,17 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Set;
public class AlleleFrequency extends VariantStratifier {
// needs to know the variant context
private ArrayList<String> states;
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
public void initialize() {
states = new ArrayList<String>();
for( double a = 0.000; a <= 1.005; a += 0.005 ) {
states.add(String.format("%.3f", a));

View File

@ -1,24 +1,20 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Set;
public class CompRod extends VariantStratifier implements RequiredStratification {
// Needs to know the comp rods
private Set<String> compNames;
private ArrayList<String> states;
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
this.compNames = compNames;
public void initialize() {
states = new ArrayList<String>();
states.addAll(compNames);
for ( RodBinding<VariantContext> rod : getVariantEvalWalker().getComps() )
states.add(rod.getName());
}
public ArrayList<String> getAllStates() {

View File

@ -2,20 +2,18 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Set;
public class Contig extends VariantStratifier {
// needs to know the variant context
private ArrayList<String> states;
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
public void initialize() {
states = new ArrayList<String>();
states.addAll(contigNames);
states.addAll(getVariantEvalWalker().getContigNames());
states.add("all");
}

View File

@ -2,11 +2,9 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Set;
/**
* CpG is a stratification module for VariantEval that divides the input data by within/not within a CpG site
@ -24,7 +22,7 @@ public class CpG extends VariantStratifier {
private ArrayList<String> states;
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
public void initialize() {
states = new ArrayList<String>();
states.add("all");
states.add("CpG");
@ -40,7 +38,7 @@ public class CpG extends VariantStratifier {
if (ref != null && ref.getBases() != null) {
String fwRefBases = new String(ref.getBases());
String leftFlank = fwRefBases.substring((fwRefBases.length()/2) - 1, (fwRefBases.length()/2) + 1);
//String leftFlank = fwRefBases.substring((fwRefBases.length()/2) - 1, (fwRefBases.length()/2) + 1);
String rightFlank = fwRefBases.substring((fwRefBases.length()/2), (fwRefBases.length()/2) + 2);
//if (leftFlank.equalsIgnoreCase("CG") || leftFlank.equalsIgnoreCase("GC") || rightFlank.equalsIgnoreCase("CG") || rightFlank.equalsIgnoreCase("GC")) {

View File

@ -2,13 +2,11 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
public class Degeneracy extends VariantStratifier {
private ArrayList<String> states;
@ -16,7 +14,7 @@ public class Degeneracy extends VariantStratifier {
private HashMap<String, HashMap<Integer, String>> degeneracies;
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
public void initialize() {
states = new ArrayList<String>();
states.add("1-fold");
states.add("2-fold");

View File

@ -1,24 +1,20 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Set;
public class EvalRod extends VariantStratifier implements RequiredStratification {
// needs to know the eval rods
private Set<String> evalNames;
private ArrayList<String> states;
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
this.evalNames = evalNames;
public void initialize() {
states = new ArrayList<String>();
states.addAll(evalNames);
for ( RodBinding<VariantContext> rod : getVariantEvalWalker().getEvals() )
states.add(rod.getName());
}
public ArrayList<String> getAllStates() {

View File

@ -2,18 +2,16 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Set;
public class Filter extends VariantStratifier {
// needs to know the variant context
private ArrayList<String> states;
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
public void initialize() {
states = new ArrayList<String>();
states.add("called");
states.add("filtered");

View File

@ -2,18 +2,16 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Set;
public class FunctionalClass extends VariantStratifier {
// needs to know the variant context
private ArrayList<String> states;
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
public void initialize() {
states = new ArrayList<String>();
states.add("all");
states.add("silent");

View File

@ -15,8 +15,8 @@ public class JexlExpression extends VariantStratifier implements StandardStratif
private ArrayList<String> states;
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
this.jexlExpressions = jexlExpressions;
public void initialize() {
jexlExpressions = getVariantEvalWalker().getJexlExpressions();
states = new ArrayList<String>();
states.add("none");

View File

@ -1,21 +1,21 @@
package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.*;
public class Novelty extends VariantStratifier implements StandardStratification {
// needs the variant contexts and known names
private Set<String> knownNames;
private List<RodBinding<VariantContext>> knowns;
final private ArrayList<String> states = new ArrayList<String>(Arrays.asList("all", "known", "novel"));
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
this.knownNames = knownNames;
public void initialize() {
knowns = getVariantEvalWalker().getKnowns();
}
public ArrayList<String> getAllStates() {
@ -24,13 +24,11 @@ public class Novelty extends VariantStratifier implements StandardStratification
public ArrayList<String> getRelevantStates(ReferenceContext ref, RefMetaDataTracker tracker, VariantContext comp, String compName, VariantContext eval, String evalName, String sampleName) {
if (tracker != null && eval != null) {
for (final String knownName : knownNames) {
final Collection<VariantContext> knownComps = tracker.getValues(VariantContext.class, knownName, ref.getLocus());
for ( final VariantContext c : knownComps ) {
// loop over sites, looking for something that matches the type eval
if ( eval.getType() == c.getType() ) {
return new ArrayList<String>(Arrays.asList("all", "known"));
}
final Collection<VariantContext> knownComps = tracker.getValues(knowns, ref.getLocus());
for ( final VariantContext c : knownComps ) {
// loop over sites, looking for something that matches the type eval
if ( eval.getType() == c.getType() ) {
return new ArrayList<String>(Arrays.asList("all", "known"));
}
}
}

View File

@ -2,20 +2,18 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Set;
public class Sample extends VariantStratifier {
// needs the sample names
private ArrayList<String> samples;
@Override
public void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames) {
public void initialize() {
samples = new ArrayList<String>();
samples.addAll(sampleNames);
samples.addAll(getVariantEvalWalker().getSampleNamesForStratification());
}
public ArrayList<String> getAllStates() {

View File

@ -3,11 +3,9 @@ package org.broadinstitute.sting.gatk.walkers.varianteval.stratifications;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.varianteval.VariantEvalWalker;
import org.broadinstitute.sting.gatk.walkers.varianteval.util.SortableJexlVCMatchExp;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.util.ArrayList;
import java.util.Set;
public abstract class VariantStratifier implements Comparable {
private VariantEvalWalker variantEvalWalker;
@ -27,7 +25,7 @@ public abstract class VariantStratifier implements Comparable {
this.variantEvalWalker = variantEvalWalker;
}
public abstract void initialize(Set<SortableJexlVCMatchExp> jexlExpressions, Set<String> compNames, Set<String> knownNames, Set<String> evalNames, Set<String> sampleNames, Set<String> contigNames);
public abstract void initialize();
public ArrayList<String> getAllStates() {
return new ArrayList<String>();

View File

@ -103,7 +103,7 @@ public class VariantEvalUtils {
try {
VariantStratifier vs = c.newInstance();
vs.setVariantEvalWalker(variantEvalWalker);
vs.initialize(variantEvalWalker.getJexlExpressions(), variantEvalWalker.getCompNames(), variantEvalWalker.getKnownNames(), variantEvalWalker.getEvalNames(), variantEvalWalker.getSampleNamesForStratification(), variantEvalWalker.getContigNames());
vs.initialize();
strats.add(vs);
} catch (InstantiationException e) {
@ -347,9 +347,9 @@ public class VariantEvalUtils {
}
}
}
bindings.put(track, mapping);
}
bindings.put(track, mapping);
}
return bindings;

View File

@ -45,10 +45,43 @@ import java.io.FileNotFoundException;
import java.util.*;
/**
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel FDR levels which were specified during VariantRecalibration
* Applies cuts to the input vcf file (by adding filter lines) to achieve the desired novel truth sensitivity levels which were specified during VariantRecalibration
*
* <p>
* Using the tranche file generated by the previous step the ApplyRecalibration walker looks at each variant's VQSLOD value
* and decides which tranche it falls in. Variants in tranches that fall below the specified truth sensitivity filter level
* have their filter field annotated with its tranche level. This will result in a call set that simultaneously is filtered
* to the desired level but also has the information necessary to pull out more variants for a higher sensitivity but a
* slightly lower quality level.
*
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration
*
* <h2>Input</h2>
* <p>
* The input raw variants to be recalibrated.
* <p>
* The recalibration table file in CSV format that was generated by the VariantRecalibrator walker.
* <p>
* The tranches file that was generated by the VariantRecalibrator walker.
*
* <h2>Output</h2>
* <p>
* A recalibrated VCF file in which each variant is annotated with its VQSLOD and filtered if the score is below the desired quality level.
*
* <h2>Examples</h2>
* <pre>
* java -Xmx3g -jar GenomeAnalysisTK.jar \
* -T ApplyRecalibration \
* -R reference/human_g1k_v37.fasta \
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
* --ts_filter_level 99.0 \
* -tranchesFile path/to/output.tranches \
* -recalFile path/to/output.recal \
* -o path/to/output.recalibrated.filtered.vcf
* </pre>
*
* @author rpoplin
* @since Mar 14, 2011
*/
public class ApplyRecalibration extends RodWalker<Integer, Integer> {
@ -57,11 +90,11 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
// Inputs
/////////////////////////////
/**
* The raw input variants to be recalibrated.
* These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling.
*/
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
public List<RodBinding<VariantContext>> input;
@Input(fullName="recal_file", shortName="recalFile", doc="The output recal file used by ApplyRecalibration", required=true)
@Input(fullName="recal_file", shortName="recalFile", doc="The input recal file used by ApplyRecalibration", required=true)
private File RECAL_FILE;
@Input(fullName="tranches_file", shortName="tranchesFile", doc="The input tranches file describing where to cut the data", required=true)
private File TRANCHES_FILE;
@ -69,7 +102,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
/////////////////////////////
// Outputs
/////////////////////////////
@Output( doc="The output filtered, recalibrated VCF file", required=true)
@Output( doc="The output filtered and recalibrated VCF file in which each variant is annotated with its VQSLOD value", required=true)
private VCFWriter vcfWriter = null;
/////////////////////////////
@ -77,7 +110,7 @@ public class ApplyRecalibration extends RodWalker<Integer, Integer> {
/////////////////////////////
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering", required=false)
private double TS_FILTER_LEVEL = 99.0;
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName = "mode", shortName = "mode", doc = "Recalibration mode to employ: 1.) SNP for recalibrating only SNPs (emitting indels untouched in the output VCF); 2.) INDEL for indels; and 3.) BOTH for recalibrating both SNPs and indels simultaneously.", required = false)
public VariantRecalibratorArgumentCollection.Mode MODE = VariantRecalibratorArgumentCollection.Mode.SNP;

View File

@ -207,6 +207,7 @@ public class GaussianMixtureModel {
for( final boolean isNull : datum.isNull ) {
if( isNull ) { return evaluateDatumMarginalized( datum ); }
}
// Fill an array with the log10 probability coming from each Gaussian and then use MathUtils to sum them up correctly
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
int gaussianIndex = 0;
for( final MultivariateGaussian gaussian : gaussians ) {
@ -215,6 +216,7 @@ public class GaussianMixtureModel {
return MathUtils.log10sumLog10(pVarInGaussianLog10); // Sum(pi_k * p(v|n,k))
}
// Used only to decide which covariate dimension is most divergent in order to report in the culprit info field annotation
public Double evaluateDatumInOneDimension( final VariantDatum datum, final int iii ) {
if(datum.isNull[iii]) { return null; }
@ -229,7 +231,7 @@ public class GaussianMixtureModel {
}
public double evaluateDatumMarginalized( final VariantDatum datum ) {
int numSamples = 0;
int numRandomDraws = 0;
double sumPVarInGaussian = 0.0;
final int numIterPerMissingAnnotation = 10; // Trade off here between speed of computation and accuracy of the marginalization
final double[] pVarInGaussianLog10 = new double[gaussians.size()];
@ -248,10 +250,10 @@ public class GaussianMixtureModel {
// add this sample's probability to the pile in order to take an average in the end
sumPVarInGaussian += Math.pow(10.0, MathUtils.log10sumLog10(pVarInGaussianLog10)); // p = 10 ^ Sum(pi_k * p(v|n,k))
numSamples++;
numRandomDraws++;
}
}
}
return Math.log10( sumPVarInGaussian / ((double) numSamples) );
return Math.log10( sumPVarInGaussian / ((double) numRandomDraws) );
}
}

View File

@ -233,13 +233,15 @@ public class VariantDataManager {
}
public void parseTrainingSets( final RefMetaDataTracker tracker, final GenomeLoc genomeLoc, final VariantContext evalVC, final VariantDatum datum, final boolean TRUST_ALL_POLYMORPHIC, final HashMap<String, Double> rodToPriorMap,
final List<RodBinding<VariantContext>> training, final List<RodBinding<VariantContext>> truth, final List<RodBinding<VariantContext>> known, final List<RodBinding<VariantContext>> badSites) {
final List<RodBinding<VariantContext>> training, final List<RodBinding<VariantContext>> truth, final List<RodBinding<VariantContext>> known, final List<RodBinding<VariantContext>> badSites, final List<RodBinding<VariantContext>> resource) {
datum.isKnown = false;
datum.atTruthSite = false;
datum.atTrainingSite = false;
datum.atAntiTrainingSite = false;
datum.prior = 2.0;
//BUGBUG: need to clean this up
for( final RodBinding<VariantContext> rod : training ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
@ -264,6 +266,13 @@ public class VariantDataManager {
}
}
}
for( final RodBinding<VariantContext> rod : resource ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( isValidVariant( evalVC, trainVC, TRUST_ALL_POLYMORPHIC ) ) {
datum.prior = Math.max( datum.prior, (rodToPriorMap.containsKey(rod.getName()) ? rodToPriorMap.get(rod.getName()) : 0.0) );
}
}
}
for( final RodBinding<VariantContext> rod : badSites ) {
for( final VariantContext trainVC : tracker.getValues(rod, genomeLoc) ) {
if( trainVC != null ) {

View File

@ -45,10 +45,54 @@ import java.io.PrintStream;
import java.util.*;
/**
* Takes variant calls as .vcf files, learns a Gaussian mixture model over the variant annotations and evaluates the variant -- assigning an informative lod score
* Create a Gaussian mixture model by looking at the annotations values over a high quality subset of the input call set and then evaluate all input variants.
*
* <p>
* This walker is the first pass in a two-stage processing step. This walker is designed to be used in conjunction with ApplyRecalibration walker.
*
* <p>
* The purpose of the variant recalibrator is to assign a well-calibrated probability to each variant call in a call set.
* One can then create highly accurate call sets by filtering based on this single estimate for the accuracy of each call.
* The approach taken by variant quality score recalibration is to develop a continuous, covarying estimate of the relationship
* between SNP call annotations (QD, SB, HaplotypeScore, HRun, for example) and the the probability that a SNP is a true genetic
* variant versus a sequencing or data processing artifact. This model is determined adaptively based on "true sites" provided
* as input, typically HapMap 3 sites and those sites found to be polymorphic on the Omni 2.5M SNP chip array. This adaptive
* error model can then be applied to both known and novel variation discovered in the call set of interest to evaluate the
* probability that each call is real. The score that gets added to the INFO field of each variant is called the VQSLOD. It is
* the log odds ratio of being a true variant versus being false under the trained Gaussian mixture model.
*
* <p>
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
* http://www.broadinstitute.org/gsa/wiki/index.php/Variant_quality_score_recalibration
*
* <h2>Input</h2>
* <p>
* The input raw variants to be recalibrated.
* <p>
* Known, truth, and training sets to be used by the algorithm. How these various sets are used is described below.
*
* <h2>Output</h2>
* <p>
* A recalibration table file in CSV format that is used by the ApplyRecalibration walker.
* <p>
* A tranches file which shows various metrics of the recalibration callset as a function of making several slices through the data.
*
* <h2>Examples</h2>
* <pre>
* java -Xmx4g -jar GenomeAnalysisTK.jar \
* -T VariantRecalibrator \
* -R reference/human_g1k_v37.fasta \
* -input NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf \
* -truth:prior=15.0 hapmap_3.3.b37.sites.vcf \
* -training:prior=15.0 hapmap_3.3.b37.sites.vcf \
* -training:prior=12.0 1000G_omni2.5.b37.sites.vcf \
* -known:prior=8.0 dbsnp_132.b37.vcf \
* -an QD -an HaplotypeScore -an MQRankSum -an ReadPosRankSum -an FS -an MQ \
* -recalFile path/to/output.recal \
* -tranchesFile path/to/output.tranches \
* -rscriptFile path/to/output.plots.R
* </pre>
*
* User: rpoplin
* Date: 3/12/11
*/
public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDatum>, ExpandingArrayList<VariantDatum>> implements TreeReducible<ExpandingArrayList<VariantDatum>> {
@ -62,42 +106,44 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
// Inputs
/////////////////////////////
/**
* The raw input variants to be recalibrated.
* These calls should be unfiltered and annotated with the error covariates that are intended to use for modeling.
*/
@Input(fullName="input", shortName = "input", doc="The raw input variants to be recalibrated", required=true)
public List<RodBinding<VariantContext>> input;
/**
* A list of training variants used to train the Gaussian mixture model.
*
* Input variants which are found to overlap with these training sites are used to build the Gaussian mixture model.
*/
@Input(fullName="training", shortName = "training", doc="A list of training variants used to train the Gaussian mixture model", required=true)
public List<RodBinding<VariantContext>> training;
/**
* A list of true variants to be used when deciding the truth sensitivity cut of the final callset.
*
* When deciding where to set the cutoff in VQSLOD sensitivity to these truth sites is used.
* Typically one might want to say I dropped my threshold until I got back 99% of HapMap sites, for example.
*/
@Input(fullName="truth", shortName = "truth", doc="A list of true variants to be used when deciding the truth sensitivity cut of the final callset", required=true)
public List<RodBinding<VariantContext>> truth;
/**
* A list of known variants to be used for metric comparison purposes.
*
* The known / novel status of a variant isn't used by the algorithm itself and is only used for reporting / display purposes.
* The output metrics are stratified by known status in order to aid in comparisons with other call sets.
*/
@Input(fullName="known", shortName = "known", doc="A list of known variants to be used for metric comparison purposes", required=false)
public List<RodBinding<VariantContext>> known = Collections.emptyList();
/**
* A list of known bad variants used to supplement training the negative model.
*
* In addition to using the worst 3% of variants as compared to the Gaussian mixture model, we can also supplement the list
* with a database of known bad variants. Maybe these are loci which are frequently filtered out in many projects (centromere, for example).
*/
@Input(fullName="badSites", shortName = "badSites", doc="A list of known bad variants used to supplement training the negative model", required=false)
public List<RodBinding<VariantContext>> badSites = Collections.emptyList();
/**
* Any set of sites for which you would like to apply a prior probability but for which you don't want to use as training, truth, or known sites.
*/
@Input(fullName="resource", shortName = "resource", doc="A list of sites for which to apply a prior probability of being correct but which aren't used by the algorithm", required=false)
public List<RodBinding<VariantContext>> resource = Collections.emptyList();
/////////////////////////////
// Outputs
/////////////////////////////
@ -109,13 +155,29 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
/////////////////////////////
// Additional Command Line Arguments
/////////////////////////////
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
/**
 * The expected transition / transversion ratio of true novel variants in your targeted region (whole genome, exome, specific
* genes), which varies greatly by the CpG and GC content of the region. See expected Ti/Tv ratios section of the GATK best
* practices wiki documentation for more information. Normal whole genome values are 2.15 and for whole exome 3.2. Note
* that this parameter is used for display purposes only and isn't used anywhere in the algorithm!
*/
@Argument(fullName="target_titv", shortName="titv", doc="The expected novel Ti/Tv ratio to use when calculating FDR tranches and for display on the optimization curve output figures. (approx 2.15 for whole genome experiments). ONLY USED FOR PLOTTING PURPOSES!", required=false)
private double TARGET_TITV = 2.15;
/**
* See the input VCF file's INFO field for a list of all available annotations.
*/
@Argument(fullName="use_annotation", shortName="an", doc="The names of the annotations which should used for calculations", required=true)
private String[] USE_ANNOTATIONS = null;
/**
* Add truth sensitivity slices through the call set at the given values. The default values are 100.0, 99.9, 99.0, and 90.0
* which will result in 4 estimated tranches in the final call set: the full set of calls (100% sensitivity at the accessible
* sites in the truth set), a 99.9% truth sensitivity tranche, along with progressively smaller tranches at 99% and 90%.
*/
@Argument(fullName="TStranche", shortName="tranche", doc="The levels of novel false discovery rate (FDR, implied by ti/tv) at which to slice the data. (in percent, that is 1.0 for 1 percent)", required=false)
private double[] TS_TRANCHES = new double[] {100.0, 99.9, 99.0, 90.0};
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the optimizer will use variants even if the specified filter name is marked in the input VCF file", required=false)
@Argument(fullName="ignore_filter", shortName="ignoreFilter", doc="If specified the variant recalibrator will use variants even if the specified filter name is marked in the input VCF file", required=false)
private String[] IGNORE_INPUT_FILTERS = null;
@Argument(fullName="path_to_Rscript", shortName = "Rscript", doc = "The path to your implementation of Rscript. For Broad users this is maybe /broad/tools/apps/R-2.6.0/bin/Rscript", required=false)
private String PATH_TO_RSCRIPT = "Rscript";
@ -123,7 +185,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
private String RSCRIPT_FILE = null;
@Argument(fullName = "path_to_resources", shortName = "resources", doc = "Path to resources folder holding the Sting R scripts.", required=false)
private String PATH_TO_RESOURCES = "public/R/";
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in plots", required=false)
@Argument(fullName="ts_filter_level", shortName="ts_filter_level", doc="The truth sensitivity level at which to start filtering, used here to indicate filtered variants in the model reporting plots", required=false)
private double TS_FILTER_LEVEL = 99.0;
/////////////////////////////
@ -170,6 +232,7 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
allInputBindings.addAll(training);
allInputBindings.addAll(known);
allInputBindings.addAll(badSites);
allInputBindings.addAll(resource);
for( final RodBinding<VariantContext> rod : allInputBindings ) {
try {
rodToPriorMap.put(rod.getName(), (rod.getTags().containsKey("prior") ? Double.parseDouble(rod.getTags().getValue("prior")) : 0.0) );
@ -207,9 +270,9 @@ public class VariantRecalibrator extends RodWalker<ExpandingArrayList<VariantDat
datum.isTransition = datum.isSNP && VariantContextUtils.isTransition(vc);
// Loop through the training data sets and if they overlap this loci then update the prior and training status appropriately
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC, rodToPriorMap, training, truth, known, badSites );
dataManager.parseTrainingSets( tracker, context.getLocation(), vc, datum, TRUST_ALL_POLYMORPHIC, rodToPriorMap, training, truth, known, badSites, resource ); // BUGBUG: need to clean this up to be a class, not a list of all the rod bindings
double priorFactor = QualityUtils.qualToProb( datum.prior );
//if( PERFORM_PROJECT_CONSENSUS ) {
//if( PERFORM_PROJECT_CONSENSUS ) { // BUGBUG: need to resurrect this functionality?
// final double consensusPrior = QualityUtils.qualToProb( 1.0 + 5.0 * datum.consensusCount );
// priorFactor = 1.0 - ((1.0 - priorFactor) * (1.0 - consensusPrior));
//}

View File

@ -53,14 +53,14 @@ public class VariantRecalibratorArgumentCollection {
public double STD_THRESHOLD = 14.0;
@Argument(fullName="qualThreshold", shortName="qual", doc="If a known variant has raw QUAL value less than -qual then don't use it for building the Gaussian mixture model.", required=false)
public double QUAL_THRESHOLD = 80.0;
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in variational Bayes algorithm.", required=false)
@Argument(fullName="shrinkage", shortName="shrinkage", doc="The shrinkage parameter in the variational Bayes algorithm.", required=false)
public double SHRINKAGE = 1.0;
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in variational Bayes algorithm.", required=false)
@Argument(fullName="dirichlet", shortName="dirichlet", doc="The dirichlet parameter in the variational Bayes algorithm.", required=false)
public double DIRICHLET_PARAMETER = 0.001;
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in variational Bayes algorithm.", required=false)
@Argument(fullName="priorCounts", shortName="priorCounts", doc="The number of prior counts to use in the variational Bayes algorithm.", required=false)
public double PRIOR_COUNTS = 20.0;
@Argument(fullName="percentBadVariants", shortName="percentBad", doc="What percentage of the worst scoring variants to use when building the Gaussian mixture model of bad variants. 0.07 means bottom 7 percent.", required=false)
public double PERCENT_BAD_VARIANTS = 0.03;
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad arugment if necessary.", required=false)
@Argument(fullName="minNumBadVariants", shortName="minNumBad", doc="The minimum amount of worst scoring variants to use when building the Gaussian mixture model of bad variants. Will override -percentBad argument if necessary.", required=false)
public int MIN_NUM_BAD_VARIANTS = 2000;
}

View File

@ -26,6 +26,7 @@
package org.broadinstitute.sting.gatk.walkers.variantrecalibration;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.util.List;
@ -43,6 +44,7 @@ public class VariantRecalibratorEngine {
/////////////////////////////
protected final static Logger logger = Logger.getLogger(VariantRecalibratorEngine.class);
public final static double MIN_ACCEPTABLE_LOD_SCORE = -20000.0;
// the unified argument collection
final private VariantRecalibratorArgumentCollection VRAC;
@ -72,13 +74,14 @@ public class VariantRecalibratorEngine {
for( final VariantDatum datum : data ) {
final double thisLod = evaluateDatum( datum, model );
if( Double.isNaN(thisLod) ) {
if( evaluateContrastively ) {
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)");
} else {
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe.");
}
throw new UserException("NaN LOD value assigned. Clustering with this few variants and these annotations is unsafe. Please consider raising the number of variants used to train the negative model (via --percentBadVariants 0.05, for example) or lowering the maximum number of Gaussians to use in the model (via --maxGaussians 4, for example)");
}
datum.lod = ( evaluateContrastively ? (datum.prior + datum.lod - thisLod) : thisLod );
datum.lod = ( evaluateContrastively ?
( Double.isInfinite(datum.lod) ? // positive model said negative infinity
( MIN_ACCEPTABLE_LOD_SCORE + GenomeAnalysisEngine.getRandomGenerator().nextDouble() * MIN_ACCEPTABLE_LOD_SCORE ) // Negative infinity lod values are possible when covariates are extremely far away from their tight Gaussians
: datum.prior + datum.lod - thisLod) // contrastive evaluation: (prior + positive model - negative model)
: thisLod ); // positive model only so set the lod and return
}
}

View File

@ -43,10 +43,54 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
/**
* Combines VCF records from different sources; supports both full merges and set unions.
* Combines VCF records from different sources.
*
* <p>
* CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data
* and any number of sources can be input. This tool currently supports two different combination types for each of
* variants (the first 8 fields of the VCF) and genotypes (the rest).
* Merge: combines multiple records into a single one; if sample names overlap then they are uniquified.
* Union: assumes each rod represents the same set of samples (although this is not enforced); using the
* priority list (if provided), emits a single record instance at every position represented in the rods.
* priority list (if provided), it emits a single record instance at every position represented in the rods.
*
* CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD
* bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect,
* CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the N merged VCFs
 * can be extracted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just
* the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single
* VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
* in the detailed example on the wiki.
*
* <h2>Input</h2>
* <p>
* One or more variant sets to combine.
* </p>
*
* <h2>Output</h2>
* <p>
* A combined VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CombineVariants \
* --variant input1.vcf \
* --variant input2.vcf \
* -o output.vcf \
* -genotypeMergeOptions UNIQUIFY
*
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CombineVariants \
* --variant:foo input1.vcf \
* --variant:bar input2.vcf \
* -o output.vcf \
* -genotypeMergeOptions PRIORITIZE
* -priority foo,bar
* </pre>
*
*/
@Reference(window=@Window(start=-50,stop=50))
public class CombineVariants extends RodWalker<Integer, Integer> {
@ -69,32 +113,43 @@ public class CombineVariants extends RodWalker<Integer, Integer> {
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
// the types of combinations we currently allow
@Argument(shortName="genotypeMergeOptions", doc="How should we merge genotype records for samples shared across the ROD files?", required=false)
@Argument(shortName="genotypeMergeOptions", doc="Determines how we should merge genotype records for samples shared across the ROD files", required=false)
public VariantContextUtils.GenotypeMergeType genotypeMergeOption = VariantContextUtils.GenotypeMergeType.PRIORITIZE;
@Argument(shortName="filteredRecordsMergeType", doc="How should we deal with records seen at the same site in the VCF, but with different FILTER fields? KEEP_IF_ANY_UNFILTERED PASSes the record if any record is unfiltered, KEEP_IF_ALL_UNFILTERED requires all records to be unfiltered", required=false)
@Argument(shortName="filteredRecordsMergeType", doc="Determines how we should handle records seen at the same site in the VCF, but with different FILTER fields", required=false)
public VariantContextUtils.FilteredRecordMergeType filteredRecordsMergeType = VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED;
@Argument(fullName="rod_priority_list", shortName="priority", doc="When taking the union of variants containing genotypes: a comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted; a complete priority list MUST be provided", required=false)
/**
* Used when taking the union of variants that contain genotypes. A complete priority list MUST be provided.
*/
@Argument(fullName="rod_priority_list", shortName="priority", doc="A comma-separated string describing the priority ordering for the genotypes as far as which record gets emitted", required=false)
public String PRIORITY_STRING = null;
@Argument(fullName="printComplexMerges", shortName="printComplexMerges", doc="Print out interesting sites requiring complex compatibility merging", required=false)
public boolean printComplexMerges = false;
@Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotation don't appear in the combined VCF", required=false)
@Argument(fullName="filteredAreUncalled", shortName="filteredAreUncalled", doc="If true, then filtered VCFs are treated as uncalled, so that filtered set annotations don't appear in the combined VCF", required=false)
public boolean filteredAreUncalled = false;
@Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype INFO field", required=false)
/**
* Used to generate a sites-only file.
*/
@Argument(fullName="minimalVCF", shortName="minimalVCF", doc="If true, then the output VCF will contain no INFO or genotype FORMAT fields", required=false)
public boolean minimalVCF = false;
@Argument(fullName="setKey", shortName="setKey", doc="Key, by default set, in the INFO key=value tag emitted describing which set the combined VCF record came from. Set to null if you don't want the set field emitted.", required=false)
/**
* Set to 'null' if you don't want the set field emitted.
*/
@Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false)
public String SET_KEY = "set";
@Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls so that one can simply perform a merge sort to combine the VCFs into one, drastically reducing the runtime.", required=false)
/**
 * This option allows the user to perform a simple merge (concatenation) to combine the VCFs, drastically reducing the runtime.
*/
@Argument(fullName="assumeIdenticalSamples", shortName="assumeIdenticalSamples", doc="If true, assume input VCFs have identical sample sets and disjoint calls", required=false)
public boolean ASSUME_IDENTICAL_SAMPLES = false;
@Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if variant is present in at least N input files.", required=false)
@Argument(fullName="minimumN", shortName="minN", doc="Combine variants and output site only if the variant is present in at least N input files.", required=false)
public int minimumN = 1;
@Hidden

View File

@ -46,6 +46,31 @@ import java.util.*;
/**
* Left-aligns indels from a variants file.
*
* <p>
* LeftAlignVariants is a tool that takes a VCF file and left-aligns any indels inside it. The same indel can often be
* placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to
* place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them.
*
* <h2>Input</h2>
* <p>
* A variant set to left-align.
* </p>
*
* <h2>Output</h2>
* <p>
* A left-aligned VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T LeftAlignVariants \
* --variant input.vcf \
* -o output.vcf
* </pre>
*
*/
@Reference(window=@Window(start=-200,stop=200))
public class LeftAlignVariants extends RodWalker<Integer, Integer> {
@ -108,7 +133,7 @@ public class LeftAlignVariants extends RodWalker<Integer, Integer> {
// get the indel length
int indelLength;
if ( vc.isDeletion() )
if ( vc.isSimpleDeletion() )
indelLength = vc.getReference().length();
else
indelLength = vc.getAlternateAllele(0).length();
@ -125,7 +150,7 @@ public class LeftAlignVariants extends RodWalker<Integer, Integer> {
// create a CIGAR string to represent the event
ArrayList<CigarElement> elements = new ArrayList<CigarElement>();
elements.add(new CigarElement(originalIndex, CigarOperator.M));
elements.add(new CigarElement(indelLength, vc.isDeletion() ? CigarOperator.D : CigarOperator.I));
elements.add(new CigarElement(indelLength, vc.isSimpleDeletion() ? CigarOperator.D : CigarOperator.I));
elements.add(new CigarElement(refSeq.length - originalIndex, CigarOperator.M));
Cigar originalCigar = new Cigar(elements);
@ -140,8 +165,8 @@ public class LeftAlignVariants extends RodWalker<Integer, Integer> {
int indelIndex = originalIndex-difference;
byte[] newBases = new byte[indelLength];
System.arraycopy((vc.isDeletion() ? refSeq : originalIndel), indelIndex, newBases, 0, indelLength);
Allele newAllele = Allele.create(newBases, vc.isDeletion());
System.arraycopy((vc.isSimpleDeletion() ? refSeq : originalIndel), indelIndex, newBases, 0, indelLength);
Allele newAllele = Allele.create(newBases, vc.isSimpleDeletion());
newVC = updateAllele(newVC, newAllele, refSeq[indelIndex-1]);
writer.add(newVC);
@ -153,14 +178,14 @@ public class LeftAlignVariants extends RodWalker<Integer, Integer> {
}
private static byte[] makeHaplotype(VariantContext vc, byte[] ref, int indexOfRef, int indelLength) {
byte[] hap = new byte[ref.length + (indelLength * (vc.isDeletion() ? -1 : 1))];
byte[] hap = new byte[ref.length + (indelLength * (vc.isSimpleDeletion() ? -1 : 1))];
// add the bases before the indel
System.arraycopy(ref, 0, hap, 0, indexOfRef);
int currentPos = indexOfRef;
// take care of the indel
if ( vc.isDeletion() ) {
if ( vc.isSimpleDeletion() ) {
indexOfRef += indelLength;
} else {
System.arraycopy(vc.getAlternateAllele(0).getBases(), 0, hap, currentPos, indelLength);

View File

@ -50,54 +50,184 @@ import java.io.PrintStream;
import java.util.*;
/**
* Takes a VCF file, selects variants based on sample(s) in which it was found and/or on various annotation criteria,
* recompute the value of certain annotations based on the new sample set, and output a new VCF with the results.
* Selects variants from a VCF source.
*
* <p>
* Often, a VCF containing many samples and/or variants will need to be subset in order to facilitate certain analyses
* (e.g. comparing and contrasting cases vs. controls; extracting variant or non-variant loci that meet certain
* requirements, displaying just a few samples in a browser like IGV, etc.). SelectVariants can be used for this purpose.
* Given a single VCF file, one or more samples can be extracted from the file (based on a complete sample name or a
* pattern match). Variants can be further selected by specifying criteria for inclusion, i.e. "DP > 1000" (depth of
* coverage greater than 1000x), "AF < 0.25" (sites with allele frequency less than 0.25). These JEXL expressions are
* documented in the Using JEXL expressions section (http://www.broadinstitute.org/gsa/wiki/index.php/Using_JEXL_expressions).
* One can optionally include concordance or discordance tracks for use in selecting overlapping variants.
*
* <h2>Input</h2>
* <p>
* A variant set to select from.
* </p>
*
* <h2>Output</h2>
* <p>
* A selected VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* Select two samples out of a VCF with many samples:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -sn SAMPLE_A_PARC \
* -sn SAMPLE_B_ACTG
*
* Select two samples and any sample that matches a regular expression:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -sn SAMPLE_1_PARC \
* -sn SAMPLE_1_ACTG \
* -sn 'SAMPLE.+PARC'
*
* Select any sample that matches a regular expression and sites where the QD annotation is more than 10:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -sn 'SAMPLE.+PARC'
* -select "QD > 10.0"
*
* Select a sample and exclude non-variant loci and filtered loci:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -sn SAMPLE_1_ACTG \
* -env \
* -ef
*
* Select a sample and restrict the output vcf to a set of intervals:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -L /path/to/my.interval_list \
* -sn SAMPLE_1_ACTG
*
* Select all calls missed in my vcf, but present in HapMap (useful to take a look at why these variants weren't called by this dataset):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant hapmap.vcf \
* --discordance myCalls.vcf
* -o output.vcf \
* -sn mySample
*
* Select all calls made by both myCalls and hisCalls (useful to take a look at what is consistent between the two callers):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant myCalls.vcf \
* --concordance hisCalls.vcf
* -o output.vcf \
* -sn mySample
*
* Generating a VCF of all the variants that are mendelian violations:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -SM family.yaml \
* -family NA12891+NA12892=NA12878 \
* -mvq 50
*
* Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -number 1000
*
* Creating a set with 50% of the total number of variants in the variant VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -fraction 0.5
*
* </pre>
*
*/
public class SelectVariants extends RodWalker<Integer, Integer> {
@ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
/**
* A site is considered discordant if there exists some sample in eval that has a non-reference genotype
* A site is considered discordant if there exists some sample in the variant track that has a non-reference genotype
* and either the site isn't present in this track, the sample isn't present in this track,
* or the sample is called reference in this track.
*/
@Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this Feature comparison track", required=false)
@Input(fullName="discordance", shortName = "disc", doc="Output variants that were not called in this comparison track", required=false)
private RodBinding<VariantContext> discordanceTrack;
/**
* A site is considered concordant if (1) we are not looking for specific samples and there is a variant called
* in both variants and concordance tracks or (2) every sample present in eval is present in the concordance
* track and they have the sample genotype call.
* in both the variant and concordance tracks or (2) every sample present in the variant track is present in the
* concordance track and they have the sample genotype call.
*/
@Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this Feature comparison track", required=false)
@Input(fullName="concordance", shortName = "conc", doc="Output variants that were also called in this comparison track", required=false)
private RodBinding<VariantContext> concordanceTrack;
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfWriter = null;
@Argument(fullName="sample_name", shortName="sn", doc="Sample name to be included in the analysis. Can be specified multiple times.", required=false)
public Set<String> sampleNames;
@Argument(fullName="sample_name", shortName="sn", doc="Include genotypes from this sample. Can be specified multiple times", required=false)
public Set<String> sampleNames = new HashSet<String>(0);
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times.", required=false)
public Set<String> sampleExpressions;
@Argument(fullName="sample_expressions", shortName="se", doc="Regular expression to select many samples from the ROD tracks provided. Can be specified multiple times", required=false)
public Set<String> sampleExpressions ;
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line). Can be specified multiple times", required=false)
@Argument(fullName="sample_file", shortName="sf", doc="File containing a list of samples (one per line) to include. Can be specified multiple times", required=false)
public Set<File> sampleFiles;
@Argument(shortName="select", doc="One or more criteria to use when selecting the data. Evaluated *after* the specified samples are extracted and the INFO-field annotations are updated.", required=false)
/**
* Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded.
*/
@Argument(fullName="exclude_sample_name", shortName="xl_sn", doc="Exclude genotypes from this sample. Can be specified multiple times", required=false)
public Set<String> XLsampleNames = new HashSet<String>(0);
/**
* Note that sample exclusion takes precedence over inclusion, so that if a sample is in both lists it will be excluded.
*/
@Argument(fullName="exclude_sample_file", shortName="xl_sf", doc="File containing a list of samples (one per line) to exclude. Can be specified multiple times", required=false)
public Set<File> XLsampleFiles = new HashSet<File>(0);
/**
* Note that these expressions are evaluated *after* the specified samples are extracted and the INFO field annotations are updated.
*/
@Argument(shortName="select", doc="One or more criteria to use when selecting the data", required=false)
public ArrayList<String> SELECT_EXPRESSIONS = new ArrayList<String>();
@Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
@Argument(fullName="excludeNonVariants", shortName="env", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false)
private boolean EXCLUDE_NON_VARIANTS = false;
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis.", required=false)
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
private boolean EXCLUDE_FILTERED = false;
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't include filtered loci.", required=false)
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false)
private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
@Hidden
@Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure.", required=false)
@Argument(fullName="keepAFSpectrum", shortName="keepAF", doc="Don't include loci found to be non-variant after the subsetting procedure", required=false)
private boolean KEEP_AF_SPECTRUM = false;
@Hidden
@ -108,30 +238,43 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="family_structure_file", shortName="familyFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private File FAMILY_STRUCTURE_FILE = null;
@Argument(fullName="family_structure", shortName="family", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
/**
* String formatted as dad+mom=child where these parameters determine which sample names are examined.
*/
@Argument(fullName="family_structure", shortName="family", doc="Deprecated; use the -SM argument instead", required=false)
private String FAMILY_STRUCTURE = "";
@Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only. Sample metadata information will be taken from YAML file (passed with -SM)", required=false)
/**
* Sample metadata information will be taken from a YAML file (see the -SM argument).
*/
@Argument(fullName="mendelianViolation", shortName="mv", doc="output mendelian violation sites only", required=false)
private Boolean MENDELIAN_VIOLATIONS = false;
@Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false)
private double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0;
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track. Variants are kept in memory to guarantee that n variants will be output, so use it only for a reasonable number of variants. Use select_random_fraction for larger numbers of variants", required=false)
/**
* Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so use it only for a reasonable
* number of variants. Use --select_random_fraction for larger numbers of variants.
*/
@Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false)
private int numRandom = 0;
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track. Routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions", required=false)
/**
* This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions.
*/
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
private double fractionRandom = 0;
@Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs.", required=false)
@Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs from the input file", required=false)
private boolean SELECT_SNPS = false;
@Argument(fullName="selectIndels", shortName="indels", doc="Select only Indels.", required=false)
@Argument(fullName="selectIndels", shortName="indels", doc="Select only indels from the input file", required=false)
private boolean SELECT_INDELS = false;
@Hidden
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String outMVFile = null;
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
private String outMVFile = null;
/* Private class used to store the intermediate variants in the integer random selection process */
private class RandomVariantStructure {
@ -173,8 +316,7 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
private ArrayList<Double> afBoosts = null;
double bkDelta = 0.0;
private PrintStream outMVFileStream = null;
private PrintStream outMVFileStream = null;
/**
@ -190,19 +332,27 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
Collection<String> samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles);
Collection<String> samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions);
// first, add any requested samples
samples.addAll(samplesFromFile);
samples.addAll(samplesFromExpressions);
if (sampleNames != null)
samples.addAll(sampleNames);
samples.addAll(sampleNames);
if(samples.isEmpty()) {
// if none were requested, we want all of them
if ( samples.isEmpty() ) {
samples.addAll(vcfSamples);
NO_SAMPLES_SPECIFIED = true;
}
for (String sample : samples) {
// now, exclude any requested samples
Collection<String> XLsamplesFromFile = SampleUtils.getSamplesFromFiles(XLsampleFiles);
samples.removeAll(XLsamplesFromFile);
samples.removeAll(XLsampleNames);
if ( samples.size() == 0 && !NO_SAMPLES_SPECIFIED )
throw new UserException("All samples requested to be included were also requested to be excluded.");
for ( String sample : samples )
logger.info("Including sample '" + sample + "'");
}
// Initialize VCF header
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);

View File

@ -25,7 +25,6 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broad.tribble.Feature;
import org.broad.tribble.TribbleException;
import org.broad.tribble.dbsnp.DbSNPFeature;
import org.broadinstitute.sting.commandline.*;
@ -34,7 +33,6 @@ import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgume
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.variantcontext.Allele;
@ -48,7 +46,32 @@ import java.util.Set;
/**
* Validates a variants file.
* Strictly validates a variants file.
*
* <p>
* ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it.
* Checks include the correctness of the reference base(s), accuracy of AC & AN values, tests against rsIDs
* when a dbSNP file is provided, and that all alternate alleles are present in at least one sample.
*
* <h2>Input</h2>
* <p>
* A variant set to filter.
* </p>
*
* <h2>Output</h2>
* <p>
* A filtered VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T ValidateVariants \
* --variant input.vcf \
* --dbsnp dbsnp.vcf
* </pre>
*
*/
@Reference(window=@Window(start=0,stop=100))
public class ValidateVariants extends RodWalker<Integer, Integer> {
@ -67,10 +90,13 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
@Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false)
protected ValidationType type = ValidationType.ALL;
@Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "should we skip validation on filtered records?", required = false)
/**
* By default, even filtered records are validated.
*/
@Argument(fullName = "doNotValidateFilteredRecords", shortName = "doNotValidateFilteredRecords", doc = "skip validation on filtered records", required = false)
protected Boolean DO_NOT_VALIDATE_FILTERED = false;
@Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "should we just emit warnings on errors instead of terminating the run?", required = false)
@Argument(fullName = "warnOnErrors", shortName = "warnOnErrors", doc = "just emit warnings on errors instead of terminating the run at the first instance", required = false)
protected Boolean WARN_ON_ERROR = false;
private long numErrors = 0;
@ -111,11 +137,11 @@ public class ValidateVariants extends RodWalker<Integer, Integer> {
Allele reportedRefAllele = vc.getReference();
Allele observedRefAllele;
// insertions
if ( vc.isInsertion() ) {
if ( vc.isSimpleInsertion() ) {
observedRefAllele = Allele.create(Allele.NULL_ALLELE_STRING);
}
// deletions
else if ( vc.isDeletion() || vc.isMixed() || vc.isMNP() ) {
else if ( vc.isSimpleDeletion() || vc.isMixed() || vc.isMNP() ) {
// we can't validate arbitrarily long deletions
if ( reportedRefAllele.length() > 100 ) {
logger.info(String.format("Reference allele is too long (%d) at position %s:%d; skipping that record.", reportedRefAllele.length(), vc.getChr(), vc.getStart()));

View File

@ -25,10 +25,8 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -43,21 +41,57 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
import java.util.*;
/**
* Converts Sequenom files to a VCF annotated with QC metrics (HW-equilibrium, % failed probes)
* Annotates a validation (from e.g. Sequenom) VCF with QC metrics (HW-equilibrium, % failed probes)
*
* <p>
* The Variant Validation Assessor is a tool for vetting/assessing validation data (containing genotypes).
* The tool produces a VCF that is annotated with information pertaining to plate quality control and by
* default is soft-filtered by high no-call rate or low Hardy-Weinberg probability.
* If you have .ped files, please first convert them to VCF format
* (see http://www.broadinstitute.org/gsa/wiki/index.php/Converting_ped_to_vcf).
*
* <h2>Input</h2>
* <p>
* A validation VCF to annotate.
* </p>
*
* <h2>Output</h2>
* <p>
* An annotated VCF.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantValidationAssessor \
* --variant input.vcf \
* -o output.vcf
* </pre>
*
*/
@Reference(window=@Window(start=0,stop=40))
public class VariantValidationAssessor extends RodWalker<VariantContext,Integer> {
@Input(fullName="variants", shortName = "V", doc="Input VCF file", required=true)
public RodBinding<VariantContext> variants;
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@Output(doc="File to which variants should be written",required=true)
protected VCFWriter vcfwriter = null;
@Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid [default:20]", required=false)
@Argument(fullName="maxHardy", doc="Maximum phred-scaled Hardy-Weinberg violation pvalue to consider an assay valid", required=false)
protected double maxHardy = 20.0;
@Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid [default:0.05]", required=false)
/**
* To disable, set to a value greater than 1.
*/
@Argument(fullName="maxNoCall", doc="Maximum no-call rate (as a fraction) to consider an assay valid", required=false)
protected double maxNoCall = 0.05;
@Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid [default:1.1, disabled]", required=false)
/**
* To disable, set to a value greater than 1.
*/
@Argument(fullName="maxHomVar", doc="Maximum homozygous variant rate (as a fraction) to consider an assay valid", required=false)
protected double maxHomNonref = 1.1;
//@Argument(fullName="populationFile", shortName="populations", doc="A tab-delimited file relating individuals to populations,"+
@ -93,7 +127,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
if ( tracker == null )
return null;
VariantContext vc = tracker.getFirstValue(variants, ref.getLocus());
VariantContext vc = tracker.getFirstValue(variantCollection.variants, ref.getLocus());
// ignore places where we don't have a variant
if ( vc == null )
return null;
@ -101,7 +135,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
if ( sampleNames == null )
sampleNames = new TreeSet<String>(vc.getSampleNames());
return addVariantInformationToCall(ref, vc);
return addVariantInformationToCall(vc);
}
public Integer reduce(VariantContext call, Integer numVariants) {
@ -113,7 +147,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
}
public void onTraversalDone(Integer finalReduce) {
final List<String> inputNames = Arrays.asList(variants.getName());
final List<String> inputNames = Arrays.asList(variantCollection.variants.getName());
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
@ -159,7 +193,7 @@ public class VariantValidationAssessor extends RodWalker<VariantContext,Integer>
}
private VariantContext addVariantInformationToCall(ReferenceContext ref, VariantContext vContext) {
private VariantContext addVariantInformationToCall(VariantContext vContext) {
// check possible filters
double hwPvalue = hardyWeinbergCalculation(vContext);

View File

@ -40,95 +40,109 @@ import java.io.PrintStream;
import java.util.*;
/**
* Emits specific fields as dictated by the user from one or more VCF files.
 * Emits specific fields from a VCF file to a tab-delimited table
*
* <p>
* This walker accepts a single VCF file and writes out user-selected fields from the
 * VCF as a header-containing, tab-delimited file. The user specifies one or more
* fields to print with the -F NAME, each of which appears as a single column in
* the output file, with a header named NAME, and the value of this field in the VCF
* one per line. NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding
* in the INFO field (AC=10). Note that this tool does not support capturing any
* GENOTYPE field values. If a VCF record is missing a value, then the tool by
* default throws an error, but the special value NA can be emitted instead with
* appropriate tool arguments.
*
* </p>
*
* <h2>Input</h2>
* <p>
* <ul>
* <li>A VCF file</li>
* <li>A list of -F fields to write</li>
* </ul>
* </p>
*
* <h2>Output</h2>
* <p>
 * A tab-delimited file containing the values of the requested fields in the VCF file
* </p>
*
* <h2>Examples</h2>
* <pre>
* -T $WalkerName \
* -V file.vcf \
* -F CHROM -F POS -F ID -F QUAL -F AC \
* -o results.table
*
* would produce a file that looks like:
*
* CHROM POS ID QUAL AC
* 1 10 . 50 1
* 1 20 rs10 99 10
* et cetera...
* </pre>
*
* @author Mark DePristo
* @since 2010
*/
public class VariantsToTable extends RodWalker<Integer, Integer> {
@ArgumentCollection
protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection();
@Output(doc="File to which results should be written",required=true)
protected PrintStream out;
@Argument(fullName="fields", shortName="F", doc="Fields to emit from the VCF, allows any VCF field, any info field, and some meta fields like nHets", required=true)
public ArrayList<String> fieldsToTake = new ArrayList<String>();
/**
* -F NAME can be any standard VCF column (CHROM, ID, QUAL) or any binding in the INFO field (e.g., AC=10).
* Note that this tool does not support capturing any GENOTYPE field values. Note this argument
* accepts any number of inputs. So -F CHROM -F POS is allowed.
*/
@Argument(fullName="fields", shortName="F", doc="The name of each field to capture for output in the table", required=true)
public List<String> fieldsToTake = new ArrayList<String>();
@Argument(fullName="showFiltered", shortName="raw", doc="Include filtered records")
/**
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
* Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value.
*/
@Advanced
@Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false)
public boolean showFiltered = false;
@Argument(fullName="maxRecords", shortName="M", doc="Maximum number of records to emit, if provided", required=false)
/**
* If provided, then this tool will exit with success after this number of records have been emitted to the file.
*/
@Advanced
@Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false)
public int MAX_RECORDS = -1;
int nRecords = 0;
/**
* By default, only biallelic (REF=A, ALT=B) sites are including in the output. If this flag is provided, then
* VariantsToTable will emit field values for records with multiple ALT alleles. Note that in general this
 * can make your resulting file unreadable and malformatted according to tools like R, as the representation of
* multi-allelic INFO field values can be lists of values.
*/
@Advanced
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
public boolean keepMultiAllelic = false;
/**
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
* is generally useful when you mistype -F CHRMO, so that you get a friendly warning about CHRMO not being
* found before the tool runs through 40M 1000G records. However, in some cases you genuinely want to allow such
* fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument
* will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
*/
@Advanced
@Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false)
public boolean ALLOW_MISSING_DATA = false;
public void initialize() {
// print out the header
out.println(Utils.join("\t", fieldsToTake));
}
public static abstract class Getter { public abstract String get(VariantContext vc); }
public static Map<String, Getter> getters = new HashMap<String, Getter>();
static {
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
getters.put("REF", new Getter() {
public String get(VariantContext vc) {
String x = "";
if ( vc.hasReferenceBaseForIndel() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x=x+new String(new byte[]{refByte});
}
return x+vc.getReference().getDisplayString();
}
});
getters.put("ALT", new Getter() {
public String get(VariantContext vc) {
StringBuilder x = new StringBuilder();
int n = vc.getAlternateAlleles().size();
if ( n == 0 ) return ".";
if ( vc.hasReferenceBaseForIndel() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x.append(new String(new byte[]{refByte}));
}
for ( int i = 0; i < n; i++ ) {
if ( i != 0 ) x.append(",");
x.append(vc.getAlternateAllele(i).getDisplayString());
}
return x.toString();
}
});
getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
if ( vc.isSNP() && vc.isBiallelic() )
return VariantContextUtils.isTransition(vc) ? "1" : "0";
else
return "-1";
}});
getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
});
getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError());
}});
}
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
if ( tracker == null ) // RodWalkers can make funky map calls
return 0;
@ -155,6 +169,15 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
return s.endsWith("*");
}
/**
* Utility function that returns the list of values for each field in fields from vc.
*
 * @param vc the VariantContext whose field values we want to capture
* @param fields a non-null list of fields to capture from VC
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise
* provides a value of NA
 * @return the values of the requested fields, one entry per field, in the order given by fields
*/
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
List<String> vals = new ArrayList<String>();
@ -183,28 +206,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
}
if (field.equals("AF") || field.equals("AC")) {
String afo = val;
double af=0;
if (afo.contains(",")) {
String[] afs = afo.split(",");
afs[0] = afs[0].substring(1,afs[0].length());
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
double[] afd = new double[afs.length];
for (int k=0; k < afd.length; k++)
afd[k] = Double.valueOf(afs[k]);
af = MathUtils.arrayMax(afd);
//af = Double.valueOf(afs[0]);
}
else
if (!afo.equals("NA"))
af = Double.valueOf(afo);
val = Double.toString(af);
if (val.contains(",")) {
// strip [,] and spaces
val = val.replace("[","");
val = val.replace("]","");
val = val.replace(" ","");
}
}
vals.add(val);
@ -213,13 +220,75 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
return vals;
}
public Integer reduceInit() {
return 0;
}
public Integer reduce(Integer counter, Integer sum) {
return counter + sum;
}
//
// default reduce -- doesn't do anything at all
//
public Integer reduceInit() { return 0; }
public Integer reduce(Integer counter, Integer sum) { return counter + sum; }
public void onTraversalDone(Integer sum) {}
// ----------------------------------------------------------------------------------------------------
//
// static system for getting values from VC by name.
//
// ----------------------------------------------------------------------------------------------------
public static abstract class Getter { public abstract String get(VariantContext vc); }
public static Map<String, Getter> getters = new HashMap<String, Getter>();
static {
// #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
getters.put("CHROM", new Getter() { public String get(VariantContext vc) { return vc.getChr(); } });
getters.put("POS", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getStart()); } });
getters.put("REF", new Getter() {
public String get(VariantContext vc) {
String x = "";
if ( vc.hasReferenceBaseForIndel() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x=x+new String(new byte[]{refByte});
}
return x+vc.getReference().getDisplayString();
}
});
getters.put("ALT", new Getter() {
public String get(VariantContext vc) {
StringBuilder x = new StringBuilder();
int n = vc.getAlternateAlleles().size();
if ( n == 0 ) return ".";
if ( vc.hasReferenceBaseForIndel() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x.append(new String(new byte[]{refByte}));
}
for ( int i = 0; i < n; i++ ) {
if ( i != 0 ) x.append(",");
x.append(vc.getAlternateAllele(i).getDisplayString());
}
return x.toString();
}
});
getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
if ( vc.isSNP() && vc.isBiallelic() )
return VariantContextUtils.isTransition(vc) ? "1" : "0";
else
return "-1";
}});
getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
});
getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
getters.put("NO-CALL", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNoCallCount()); } });
getters.put("VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount() + vc.getHomVarCount()); } });
getters.put("NSAMPLES", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples()); } });
getters.put("NCALLED", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getNSamples() - vc.getNoCallCount()); } });
getters.put("GQ", new Getter() { public String get(VariantContext vc) {
if ( vc.getNSamples() > 1 ) throw new UserException("Cannot get GQ values for multi-sample VCF");
return String.format("%.2f", 10 * vc.getGenotype(0).getNegLog10PError());
}});
}
}

View File

@ -33,7 +33,6 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.VariantContextAdaptors;
import org.broadinstitute.sting.gatk.refdata.features.DbSNPHelper;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.walkers.*;
@ -53,6 +52,30 @@ import java.util.*;
/**
* Converts variants from other file formats to VCF format.
*
* <p>
* Note that there must be a Tribble feature/codec for the file format as well as an adaptor.
*
* <h2>Input</h2>
* <p>
* A variant file to filter.
* </p>
*
* <h2>Output</h2>
* <p>
* A VCF file.
* </p>
*
* <h2>Examples</h2>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T VariantsToVCF \
* -o output.vcf \
* --variant:RawHapMap input.hapmap \
* --dbsnp dbsnp.vcf
* </pre>
*
*/
@Reference(window=@Window(start=-40,stop=40))
public class VariantsToVCF extends RodWalker<Integer, Integer> {
@ -61,15 +84,24 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
protected VCFWriter baseWriter = null;
private SortingVCFWriter vcfwriter; // needed because hapmap/dbsnp indel records move
/**
* Variants from this input file are used by this tool as input.
*/
@Input(fullName="variant", shortName = "V", doc="Input variant file", required=true)
public RodBinding<Feature> variants;
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod (for data like GELI with genotypes)", required=false)
/**
* This argument is used for data (like GELI) with genotypes but no sample names encoded within.
*/
@Argument(fullName="sample", shortName="sample", doc="The sample name represented by the variant rod", required=false)
protected String sampleName = null;
/**
* This argument is useful for fixing input VCFs with bad reference bases (the output will be a fixed version of the VCF).
*/
@Argument(fullName="fixRef", shortName="fixRef", doc="Fix common reference base in case there's an indel without padding", required=false)
protected boolean fixReferenceBase = false;
@ -87,7 +119,7 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
if ( tracker == null || !BaseUtils.isRegularBase(ref.getBase()) )
return 0;
String rsID = dbsnp == null ? null : DbSNPHelper.rsIDOfFirstRealVariant(tracker.getValues(dbsnp.dbsnp, context.getLocation()), VariantContext.Type.SNP);
String rsID = dbsnp == null ? null : VCFUtils.rsIDOfFirstRealVariant(tracker.getValues(dbsnp.dbsnp, context.getLocation()), VariantContext.Type.SNP);
Collection<VariantContext> contexts = getVariantContexts(tracker, ref);
@ -135,8 +167,8 @@ public class VariantsToVCF extends RodWalker<Integer, Integer> {
continue;
Map<String, Allele> alleleMap = new HashMap<String, Allele>(2);
alleleMap.put(RawHapMapFeature.DELETION, Allele.create(Allele.NULL_ALLELE_STRING, dbsnpVC.isInsertion()));
alleleMap.put(RawHapMapFeature.INSERTION, Allele.create(((RawHapMapFeature)record).getAlleles()[1], !dbsnpVC.isInsertion()));
alleleMap.put(RawHapMapFeature.DELETION, Allele.create(Allele.NULL_ALLELE_STRING, dbsnpVC.isSimpleInsertion()));
alleleMap.put(RawHapMapFeature.INSERTION, Allele.create(((RawHapMapFeature)record).getAlleles()[1], !dbsnpVC.isSimpleInsertion()));
hapmap.setActualAlleles(alleleMap);
// also, use the correct positioning for insertions

View File

@ -0,0 +1,100 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.jna.drmaa.v1_0;
import org.ggf.drmaa.DrmaaException;
import org.ggf.drmaa.JobInfo;
import java.util.Map;
/**
 * JNA mapping from Java to C DRMAA binding.
 *
 * Immutable snapshot of a completed job's termination status and reported
 * resource usage, implementing the DRMAA {@code JobInfo} contract.
 */
public class JnaJobInfo implements JobInfo {
    private final String jobId;
    private final Map<String, String> resourceUsage;
    private final boolean exited;
    private final int exitCode;
    private final boolean signaled;
    private final String signalName;
    private final boolean coreDumped;
    private final boolean aborted;

    /**
     * Creates the job info snapshot.
     *
     * @param jobId             the DRMAA job id
     * @param rusage            resource usage reported for the job, keyed by metric name
     * @param hasExited         whether the job terminated normally with an exit status
     * @param exitStatus        the exit status; only meaningful when {@code hasExited} is true
     * @param hasSignaled       whether the job was terminated by a signal
     * @param terminatingSignal the terminating signal; only meaningful when {@code hasSignaled} is true
     * @param hasCoreDump       whether the job produced a core dump
     * @param wasAborted        whether the job was aborted before completing
     */
    public JnaJobInfo(String jobId, Map<String, String> rusage, boolean hasExited, int exitStatus, boolean hasSignaled, String terminatingSignal, boolean hasCoreDump, boolean wasAborted) {
        this.jobId = jobId;
        this.resourceUsage = rusage;
        this.exited = hasExited;
        this.exitCode = exitStatus;
        this.signaled = hasSignaled;
        this.signalName = terminatingSignal;
        this.coreDumped = hasCoreDump;
        this.aborted = wasAborted;
    }

    /** @return the DRMAA job id for this job */
    @Override
    public String getJobId() throws DrmaaException {
        return jobId;
    }

    /** @return the resource usage map reported for the job */
    @Override
    public Map getResourceUsage() throws DrmaaException {
        return resourceUsage;
    }

    /** @return true when the job terminated normally with an exit status */
    @Override
    public boolean hasExited() throws DrmaaException {
        return exited;
    }

    /**
     * @return the job's exit status
     * @throws IllegalStateException when the job did not exit normally
     */
    @Override
    public int getExitStatus() throws DrmaaException {
        if (exited)
            return exitCode;
        throw new IllegalStateException("job has not exited");
    }

    /** @return true when the job was terminated by a signal */
    @Override
    public boolean hasSignaled() throws DrmaaException {
        return signaled;
    }

    /**
     * @return the name of the terminating signal
     * @throws IllegalStateException when the job was not terminated by a signal
     */
    @Override
    public String getTerminatingSignal() throws DrmaaException {
        if (signaled)
            return signalName;
        throw new IllegalStateException("job has not signaled");
    }

    /** @return true when the job produced a core dump */
    @Override
    public boolean hasCoreDump() throws DrmaaException {
        return coreDumped;
    }

    /** @return true when the job was aborted before completing */
    @Override
    public boolean wasAborted() throws DrmaaException {
        return aborted;
    }
}

View File

@ -0,0 +1,315 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.jna.drmaa.v1_0;
import com.sun.jna.Pointer;
import org.ggf.drmaa.*;
import java.util.*;
/**
 * JNA job template: maps the Java DRMAA {@link JobTemplate} interface onto a
 * native {@code drmaa_job_template_t*}, delegating every attribute get/set to
 * the static helpers on {@link JnaSession}.
 */
public class JnaJobTemplate implements JobTemplate {
    // Session that allocated this template; participates in equals() so templates
    // from different sessions never compare equal.
    private final JnaSession session;
    // Opaque native job template handle.
    private final Pointer jt;

    public JnaJobTemplate(JnaSession session, Pointer jt) {
        this.session = session;
        this.jt = jt;
    }

    /** @return the underlying native job template pointer. */
    public Pointer getPointer() {
        return jt;
    }

    @Override
    public void setRemoteCommand(String s) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_REMOTE_COMMAND, s);
    }

    @Override
    public String getRemoteCommand() throws DrmaaException {
        return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_REMOTE_COMMAND);
    }

    @SuppressWarnings("unchecked")
    @Override
    public void setArgs(List list) throws DrmaaException {
        JnaSession.setVectorAttribute(jt, LibDrmaa.DRMAA_V_ARGV, list);
    }

    @Override
    public List getArgs() throws DrmaaException {
        return JnaSession.getVectorAttribute(jt, LibDrmaa.DRMAA_V_ARGV);
    }

    /**
     * Sets the submission state, translating the interface's int constant into
     * the DRMAA string representation.
     * @throws InvalidAttributeValueException if {@code state} is neither
     *         {@link JobTemplate#HOLD_STATE} nor {@link JobTemplate#ACTIVE_STATE}.
     */
    @Override
    public void setJobSubmissionState(int state) throws DrmaaException {
        if (state == JobTemplate.HOLD_STATE) {
            JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JS_STATE, LibDrmaa.DRMAA_SUBMISSION_STATE_HOLD);
        } else if (state == JobTemplate.ACTIVE_STATE) {
            JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JS_STATE, LibDrmaa.DRMAA_SUBMISSION_STATE_ACTIVE);
        } else {
            throw new InvalidAttributeValueException("jobSubmissionState attribute is invalid");
        }
    }

    /**
     * Reads the submission state back from the native template.
     * @throws InvalidAttributeValueException if the stored string matches
     *         neither known state constant.
     */
    @Override
    public int getJobSubmissionState() throws DrmaaException {
        String value = JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JS_STATE);
        if (LibDrmaa.DRMAA_SUBMISSION_STATE_HOLD.equals(value))
            return JobTemplate.HOLD_STATE;
        if (LibDrmaa.DRMAA_SUBMISSION_STATE_ACTIVE.equals(value))
            return JobTemplate.ACTIVE_STATE;
        throw new InvalidAttributeValueException("jobSubmissionState attribute is invalid");
    }

    @SuppressWarnings("unchecked")
    @Override
    public void setJobEnvironment(Map env) throws DrmaaException {
        // DRMAA stores the environment as a vector of "key=value" strings.
        JnaSession.setVectorAttribute(jt, LibDrmaa.DRMAA_V_ENV, JnaSession.mapToCollection(env));
    }

    @SuppressWarnings("unchecked")
    @Override
    public Map getJobEnvironment() throws DrmaaException {
        return JnaSession.collectionToMap(JnaSession.getVectorAttribute(jt, LibDrmaa.DRMAA_V_ENV));
    }

    @Override
    public void setWorkingDirectory(String s) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_WD, s);
    }

    @Override
    public String getWorkingDirectory() throws DrmaaException {
        return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_WD);
    }

    @Override
    public void setJobCategory(String s) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JOB_CATEGORY, s);
    }

    @Override
    public String getJobCategory() throws DrmaaException {
        return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JOB_CATEGORY);
    }

    @Override
    public void setNativeSpecification(String s) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_NATIVE_SPECIFICATION, s);
    }

    @Override
    public String getNativeSpecification() throws DrmaaException {
        return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_NATIVE_SPECIFICATION);
    }

    @SuppressWarnings("unchecked")
    @Override
    public void setEmail(Set set) throws DrmaaException {
        JnaSession.setVectorAttribute(jt, LibDrmaa.DRMAA_V_EMAIL, set);
    }

    @SuppressWarnings("unchecked")
    @Override
    public Set getEmail() throws DrmaaException {
        return new LinkedHashSet<String>(JnaSession.getVectorAttribute(jt, LibDrmaa.DRMAA_V_EMAIL));
    }

    @Override
    public void setBlockEmail(boolean b) throws DrmaaException {
        // DRMAA encodes the flag as "1"/"0".
        final String flag = b ? "1" : "0";
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_BLOCK_EMAIL, flag);
    }

    @Override
    public boolean getBlockEmail() throws DrmaaException {
        return "1".equals(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_BLOCK_EMAIL));
    }

    @Override
    public void setStartTime(PartialTimestamp partialTimestamp) throws DrmaaException {
        JnaSession.setPartialTime(jt, LibDrmaa.DRMAA_START_TIME, partialTimestamp);
    }

    @Override
    public PartialTimestamp getStartTime() throws DrmaaException {
        return JnaSession.getPartialTime(jt, LibDrmaa.DRMAA_START_TIME);
    }

    @Override
    public void setJobName(String s) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JOB_NAME, s);
    }

    @Override
    public String getJobName() throws DrmaaException {
        return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JOB_NAME);
    }

    @Override
    public void setInputPath(String s) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_INPUT_PATH, s);
    }

    @Override
    public String getInputPath() throws DrmaaException {
        return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_INPUT_PATH);
    }

    @Override
    public void setOutputPath(String s) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_OUTPUT_PATH, s);
    }

    @Override
    public String getOutputPath() throws DrmaaException {
        return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_OUTPUT_PATH);
    }

    @Override
    public void setErrorPath(String s) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_ERROR_PATH, s);
    }

    @Override
    public String getErrorPath() throws DrmaaException {
        return JnaSession.getAttribute(jt, LibDrmaa.DRMAA_ERROR_PATH);
    }

    @Override
    public void setJoinFiles(boolean b) throws DrmaaException {
        // DRMAA encodes the flag as "y"/"n".
        final String flag = b ? "y" : "n";
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_JOIN_FILES, flag);
    }

    @Override
    public boolean getJoinFiles() throws DrmaaException {
        return "y".equals(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_JOIN_FILES));
    }

    /**
     * Encodes the transfer mode as a subset of the characters "ioe"
     * (input/output/error), per the DRMAA attribute format.
     */
    @Override
    public void setTransferFiles(FileTransferMode fileTransferMode) throws DrmaaException {
        String flags = (fileTransferMode.getInputStream() ? "i" : "")
                + (fileTransferMode.getOutputStream() ? "o" : "")
                + (fileTransferMode.getErrorStream() ? "e" : "");
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_TRANSFER_FILES, flags);
    }

    /**
     * Decodes the "ioe" flag string back into a {@link FileTransferMode}.
     * @return {@code null} when the attribute is unset.
     */
    @Override
    public FileTransferMode getTransferFiles() throws DrmaaException {
        String flags = JnaSession.getAttribute(jt, LibDrmaa.DRMAA_TRANSFER_FILES);
        if (flags == null)
            return null;
        FileTransferMode transferMode = new FileTransferMode();
        transferMode.setInputStream(flags.contains("i"));
        transferMode.setOutputStream(flags.contains("o"));
        transferMode.setErrorStream(flags.contains("e"));
        return transferMode;
    }

    @Override
    public void setDeadlineTime(PartialTimestamp partialTimestamp) throws DrmaaException {
        JnaSession.setPartialTime(jt, LibDrmaa.DRMAA_DEADLINE_TIME, partialTimestamp);
    }

    @Override
    public PartialTimestamp getDeadlineTime() throws DrmaaException {
        return JnaSession.getPartialTime(jt, LibDrmaa.DRMAA_DEADLINE_TIME);
    }

    @Override
    public void setHardWallclockTimeLimit(long l) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_WCT_HLIMIT, JnaSession.formatLimit(l));
    }

    @Override
    public long getHardWallclockTimeLimit() throws DrmaaException {
        return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_WCT_HLIMIT));
    }

    @Override
    public void setSoftWallclockTimeLimit(long l) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_WCT_SLIMIT, JnaSession.formatLimit(l));
    }

    @Override
    public long getSoftWallclockTimeLimit() throws DrmaaException {
        return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_WCT_SLIMIT));
    }

    @Override
    public void setHardRunDurationLimit(long l) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_DURATION_HLIMIT, JnaSession.formatLimit(l));
    }

    @Override
    public long getHardRunDurationLimit() throws DrmaaException {
        return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_DURATION_HLIMIT));
    }

    @Override
    public void setSoftRunDurationLimit(long l) throws DrmaaException {
        JnaSession.setAttribute(jt, LibDrmaa.DRMAA_DURATION_SLIMIT, JnaSession.formatLimit(l));
    }

    @Override
    public long getSoftRunDurationLimit() throws DrmaaException {
        return JnaSession.parseLimit(JnaSession.getAttribute(jt, LibDrmaa.DRMAA_DURATION_SLIMIT));
    }

    @Override
    public Set getAttributeNames() throws DrmaaException {
        return JnaSession.getAttrNames();
    }

    /** Two templates are equal when they wrap the same pointer in the same session. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (!(obj instanceof JnaJobTemplate))
            return false;
        JnaJobTemplate that = (JnaJobTemplate) obj;
        return jt.equals(that.jt) && session.equals(that.session);
    }

    @Override
    public int hashCode() {
        // Pointer-only hash is consistent with equals(): equal templates share jt.
        return jt.hashCode();
    }
}

View File

@ -0,0 +1,450 @@
/*
* Copyright (c) 2011, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.jna.drmaa.v1_0;
import com.sun.jna.Memory;
import com.sun.jna.NativeLong;
import com.sun.jna.Pointer;
import com.sun.jna.StringArray;
import com.sun.jna.ptr.IntByReference;
import com.sun.jna.ptr.PointerByReference;
import org.ggf.drmaa.*;
import java.text.ParseException;
import java.util.*;
/**
 * JNA mapping from Java to C DRMAA binding.
 * See: Java and C Binding Documents on http://drmaa.org
 *
 * Each native call receives a per-thread scratch buffer for the error string;
 * {@link #checkError(int)} converts non-zero DRMAA error codes into the
 * corresponding {@link DrmaaException} subtypes.
 */
public class JnaSession implements Session {
    private static final PartialTimestampFormat PARTIAL_TIMESTAMP_FORMAT = new PartialTimestampFormat();

    // Per-thread native buffer the DRMAA C library writes its error message into.
    // Thread-local because concurrent callers must not clobber each other's message.
    private static final ThreadLocal<Memory> threadError = new ThreadLocal<Memory>() {
        @Override
        protected Memory initialValue() {
            return new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER);
        }
    };

    @Override
    public void init(String contact) throws DrmaaException {
        checkError(LibDrmaa.drmaa_init(contact, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
    }

    @Override
    public void exit() throws DrmaaException {
        checkError(LibDrmaa.drmaa_exit(getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
    }

    /** Allocates a native job template and wraps it in a {@link JnaJobTemplate}. */
    @Override
    public JobTemplate createJobTemplate() throws DrmaaException {
        PointerByReference jtRef = new PointerByReference();
        checkError(LibDrmaa.drmaa_allocate_job_template(jtRef, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        return new JnaJobTemplate(this, jtRef.getValue());
    }

    @Override
    public void deleteJobTemplate(JobTemplate jobTemplate) throws DrmaaException {
        JnaJobTemplate jnaJobTemplate = (JnaJobTemplate) jobTemplate;
        checkError(LibDrmaa.drmaa_delete_job_template(jnaJobTemplate.getPointer(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
    }

    /** Submits a single job and returns the DRM-assigned job id. */
    @Override
    public String runJob(JobTemplate jobTemplate) throws DrmaaException {
        Memory jobId = new Memory(LibDrmaa.DRMAA_JOBNAME_BUFFER);
        JnaJobTemplate jnaJobTemplate = (JnaJobTemplate) jobTemplate;
        checkError(LibDrmaa.drmaa_run_job(jobId, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, jnaJobTemplate.getPointer(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        return jobId.getString(0);
    }

    /** Submits an array job for indices start..end step incr; returns the job ids. */
    @Override
    public List runBulkJobs(JobTemplate jobTemplate, int start, int end, int incr) throws DrmaaException {
        PointerByReference jobIds = new PointerByReference();
        JnaJobTemplate jnaJobTemplate = (JnaJobTemplate) jobTemplate;
        checkError(LibDrmaa.drmaa_run_bulk_jobs(jobIds, jnaJobTemplate.getPointer(), start, end, incr, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        try {
            return getJobIds(jobIds);
        } finally {
            // Always release the native job-id iterator, even if copying failed.
            releaseJobIds(jobIds);
        }
    }

    @Override
    public void control(String jobId, int action) throws DrmaaException {
        checkError(LibDrmaa.drmaa_control(jobId, action, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
    }

    @SuppressWarnings("unchecked")
    @Override
    public void synchronize(List list, long timeout, boolean dispose) throws DrmaaException {
        StringArray jobIds = new StringArray((String[]) list.toArray(new String[list.size()]));
        checkError(LibDrmaa.drmaa_synchronize(jobIds, new NativeLong(timeout), dispose ? 1 : 0, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
    }

    /**
     * Waits for a job to finish and returns its exit information.
     *
     * @param jobId   id of the job to wait for.
     * @param timeout seconds to wait (or a DRMAA timeout constant).
     * @return the job's exit/signal/rusage information.
     * @throws DrmaaException on any DRMAA failure, including ExitTimeoutException
     *         when the timeout elapses before the job completes.
     */
    @Override
    public JobInfo wait(String jobId, long timeout) throws DrmaaException {
        Memory jobIdOut = new Memory(LibDrmaa.DRMAA_JOBNAME_BUFFER);
        IntByReference stat = new IntByReference();
        PointerByReference rusage = new PointerByReference();
        IntByReference exited = new IntByReference();
        IntByReference exitStatus = new IntByReference();
        IntByReference signaled = new IntByReference();
        Memory signal = new Memory(LibDrmaa.DRMAA_SIGNAL_BUFFER);
        IntByReference coreDumped = new IntByReference();
        IntByReference aborted = new IntByReference();
        int errnum;
        errnum = LibDrmaa.drmaa_wait(jobId, jobIdOut, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, stat, new NativeLong(timeout), rusage, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN);
        // BUGFIX: previously errnum was never passed to checkError(), so failures
        // such as DRMAA_ERRNO_EXIT_TIMEOUT or DRMAA_ERRNO_INVALID_JOB fell through
        // to the rusage-copy below (dereferencing an unset pointer) and produced a
        // bogus JobInfo instead of the mandated exception. checkError() treats
        // DRMAA_ERRNO_NO_RUSAGE as success, so the special case below still runs.
        checkError(errnum);
        Map<String, String> rusageMap;
        if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_RUSAGE) {
            rusageMap = null;
        } else {
            try {
                rusageMap = collectionToMap(getAttrValues(rusage));
            } finally {
                releaseAttrValues(rusage);
            }
        }
        checkError(LibDrmaa.drmaa_wifexited(exited, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        if (exited.getValue() != 0) {
            checkError(LibDrmaa.drmaa_wexitstatus(exitStatus, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        }
        checkError(LibDrmaa.drmaa_wifsignaled(signaled, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        if (signaled.getValue() != 0) {
            checkError(LibDrmaa.drmaa_wtermsig(signal, LibDrmaa.DRMAA_SIGNAL_BUFFER_LEN, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
            checkError(LibDrmaa.drmaa_wcoredump(coreDumped, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        }
        checkError(LibDrmaa.drmaa_wifaborted(aborted, stat.getValue(), getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        // NOTE(review): when the job was not signaled, signal.getString(0) reads a
        // buffer drmaa_wtermsig never wrote — presumably callers only look at the
        // signal name when signaled is true; confirm against JnaJobInfo's contract.
        return new JnaJobInfo(jobIdOut.getString(0), rusageMap, exited.getValue() != 0, exitStatus.getValue(),
                signaled.getValue() != 0, signal.getString(0), coreDumped.getValue() != 0, aborted.getValue() != 0);
    }

    @Override
    public int getJobProgramStatus(String jobId) throws DrmaaException {
        IntByReference remotePs = new IntByReference();
        checkError(LibDrmaa.drmaa_job_ps(jobId, remotePs, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        return remotePs.getValue();
    }

    @Override
    public String getContact() {
        Memory contact = new Memory(LibDrmaa.DRMAA_CONTACT_BUFFER);
        try {
            checkError(LibDrmaa.drmaa_get_contact(contact, LibDrmaa.DRMAA_CONTACT_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        } catch (DrmaaException e) {
            // DRMAA spec says this method should throw DrmaaException.
            // Why doesn't interface implement this?
            throw new RuntimeException(e);
        }
        return contact.getString(0);
    }

    @Override
    public Version getVersion() {
        IntByReference major = new IntByReference();
        IntByReference minor = new IntByReference();
        try {
            checkError(LibDrmaa.drmaa_version(major, minor, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        } catch (DrmaaException e) {
            // DRMAA spec says this method should throw DrmaaException.
            // Why doesn't interface implement this?
            throw new RuntimeException(e);
        }
        return new Version(major.getValue(), minor.getValue());
    }

    @Override
    public String getDrmSystem() {
        Memory drmSystem = new Memory(LibDrmaa.DRMAA_DRM_SYSTEM_BUFFER);
        try {
            checkError(LibDrmaa.drmaa_get_DRM_system(drmSystem, LibDrmaa.DRMAA_DRM_SYSTEM_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        } catch (DrmaaException e) {
            // DRMAA spec says this method should throw DrmaaException.
            // Why doesn't interface implement this?
            throw new RuntimeException(e);
        }
        return drmSystem.getString(0);
    }

    @Override
    public String getDrmaaImplementation() {
        Memory drmaaImplementation = new Memory(LibDrmaa.DRMAA_DRMAA_IMPLEMENTATION_BUFFER);
        try {
            checkError(LibDrmaa.drmaa_get_DRMAA_implementation(drmaaImplementation, LibDrmaa.DRMAA_DRMAA_IMPLEMENTATION_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        } catch (DrmaaException e) {
            // DRMAA spec says this method should throw DrmaaException.
            // Why doesn't interface implement this?
            throw new RuntimeException(e);
        }
        return drmaaImplementation.getString(0);
    }

    /** Sets a scalar string attribute on a native job template. */
    public static void setAttribute(Pointer jt, String name, String value) throws DrmaaException {
        checkError(LibDrmaa.drmaa_set_attribute(jt, name, value, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
    }

    /** Reads a scalar string attribute from a native job template. */
    public static String getAttribute(Pointer jt, String name) throws DrmaaException {
        Memory attrBuffer = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER);
        checkError(LibDrmaa.drmaa_get_attribute(jt, name, attrBuffer, LibDrmaa.DRMAA_ATTR_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        return attrBuffer.getString(0);
    }

    /** Sets a vector (multi-valued) attribute on a native job template. */
    public static void setVectorAttribute(Pointer jt, String name, Collection<String> values) throws DrmaaException {
        StringArray valuesArray = new StringArray(values.toArray(new String[values.size()]));
        checkError(LibDrmaa.drmaa_set_vector_attribute(jt, name, valuesArray, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
    }

    /** Reads a vector attribute, copying the native iterator into a Java list. */
    public static List<String> getVectorAttribute(Pointer jt, String name) throws DrmaaException {
        PointerByReference values = new PointerByReference();
        checkError(LibDrmaa.drmaa_get_vector_attribute(jt, name, values, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        try {
            return getAttrValues(values);
        } finally {
            releaseAttrValues(values);
        }
    }

    /** Stores a partial timestamp attribute in DRMAA's string format. */
    public static void setPartialTime(Pointer jt, String name, PartialTimestamp partialTimestamp) throws DrmaaException {
        setAttribute(jt, name, PARTIAL_TIMESTAMP_FORMAT.format(partialTimestamp));
    }

    /**
     * Reads a partial timestamp attribute.
     * @return {@code null} when the attribute is unset.
     * @throws InternalException when the stored value cannot be parsed.
     */
    public static PartialTimestamp getPartialTime(Pointer jt, String name) throws DrmaaException {
        String time = getAttribute(jt, name);
        if (time == null)
            return null;
        try {
            return PARTIAL_TIMESTAMP_FORMAT.parse(time);
        } catch (ParseException e) {
            throw new InternalException(name + " property is unparsable");
        }
    }

    /** @return the set of attribute names supported by the DRMAA implementation. */
    public static Set<String> getAttrNames() throws DrmaaException {
        PointerByReference values = new PointerByReference();
        checkError(LibDrmaa.drmaa_get_attribute_names(values, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN));
        try {
            return new LinkedHashSet<String>(getAttrNames(values));
        } finally {
            releaseAttrNames(values);
        }
    }

    /** Flattens a map into DRMAA's "key=value" string collection form. */
    public static Collection<String> mapToCollection(Map<String, String> map) {
        Collection<String> collection = new LinkedHashSet<String>();
        for (Map.Entry<String, String> entry: map.entrySet())
            collection.add(entry.getKey() + "=" + entry.getValue());
        return collection;
    }

    /**
     * Parses "key=value" strings back into a map; entries that are null or lack
     * an '=' are skipped.
     */
    public static Map<String, String> collectionToMap(Collection<String> list) {
        Map<String, String> map = new LinkedHashMap<String, String>();
        for (String entry: list) {
            if (entry == null)
                continue;
            int equals = entry.indexOf('=');
            if (equals < 0)
                continue;
            map.put(entry.substring(0, equals), entry.substring(equals + 1));
        }
        return map;
    }

    /** Formats a limit in seconds as the DRMAA "h:mm:ss" string. */
    public static String formatLimit(long secs) {
        long seconds = (secs % 60);
        long minutes = (secs / 60) % 60;
        long hours = (secs / 3600);
        return String.format("%d:%02d:%02d", hours, minutes, seconds);
    }

    /**
     * Parses a colon-separated time limit ("h:mm:ss", "mm:ss", or "ss") back
     * into seconds; returns 0 for a null limit.
     */
    public static long parseLimit(String limit) {
        long seconds = 0;
        if (limit != null) {
            for (String token: limit.split(":")) {
                seconds *= 60;
                seconds += Long.parseLong(token);
            }
        }
        return seconds;
    }

    // Copies a native attribute-name iterator into a Java list.
    private static List<String> getAttrNames(PointerByReference names) throws DrmaaException {
        List<String> namesList = new ArrayList<String>();
        IntByReference size = new IntByReference();
        int errnum;
        errnum = LibDrmaa.drmaa_get_num_attr_names(names.getValue(), size);
        checkError(errnum, "unable to get attribute names");
        int num = size.getValue();
        Memory value = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER);
        for (int i = 1; i <= num; i++) {
            errnum = LibDrmaa.drmaa_get_next_attr_name(names.getValue(), value, LibDrmaa.DRMAA_ATTR_BUFFER_LEN);
            checkError(errnum, "unable to get attribute name " + i);
            if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS)
                break;
            namesList.add(value.getString(0));
        }
        return namesList;
    }

    // Copies a native attribute-value iterator into a Java list.
    private static List<String> getAttrValues(PointerByReference values) throws DrmaaException {
        List<String> valuesList = new ArrayList<String>();
        IntByReference size = new IntByReference();
        int errnum;
        errnum = LibDrmaa.drmaa_get_num_attr_values(values.getValue(), size);
        checkError(errnum, "unable to get attribute values");
        int num = size.getValue();
        Memory value = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER);
        for (int i = 1; i <= num; i++) {
            errnum = LibDrmaa.drmaa_get_next_attr_value(values.getValue(), value, LibDrmaa.DRMAA_ATTR_BUFFER_LEN);
            checkError(errnum, "unable to get attribute value " + i);
            if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS)
                break;
            valuesList.add(value.getString(0));
        }
        return valuesList;
    }

    // Copies a native job-id iterator into a Java list.
    private static List<String> getJobIds(PointerByReference jobIds) throws DrmaaException {
        List<String> jobIdsList = new ArrayList<String>();
        IntByReference size = new IntByReference();
        int errnum;
        errnum = LibDrmaa.drmaa_get_num_job_ids(jobIds.getValue(), size);
        checkError(errnum, "unable to get jobIds");
        int num = size.getValue();
        Memory value = new Memory(LibDrmaa.DRMAA_JOBNAME_BUFFER);
        for (int i = 1; i <= num; i++) {
            errnum = LibDrmaa.drmaa_get_next_job_id(jobIds.getValue(), value, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN);
            checkError(errnum, "unable to get jobId " + i);
            if (errnum == LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS)
                break;
            jobIdsList.add(value.getString(0));
        }
        return jobIdsList;
    }

    private static void releaseAttrNames(PointerByReference names) throws DrmaaException {
        LibDrmaa.drmaa_release_attr_names(names.getValue());
    }

    private static void releaseAttrValues(PointerByReference values) throws DrmaaException {
        LibDrmaa.drmaa_release_attr_values(values.getValue());
    }

    private static void releaseJobIds(PointerByReference jobIds) throws DrmaaException {
        LibDrmaa.drmaa_release_job_ids(jobIds.getValue());
    }

    // @return this thread's native error-message buffer.
    private static Memory getError() {
        return threadError.get();
    }

    // Throws the DrmaaException matching errnum, reading the message from the
    // thread-local error buffer. Success (and the benign NO_RUSAGE /
    // NO_MORE_ELEMENTS codes, via the overload) pass through silently.
    private static void checkError(int errnum) throws DrmaaException {
        if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS)
            checkError(errnum, getError().getString(0));
    }

    // Maps each DRMAA error code to its exception type per the DRMAA Java binding.
    private static void checkError(int errnum, String error) throws DrmaaException {
        switch (errnum) {
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS:
                break;
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INTERNAL_ERROR:
                throw new InternalException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DRM_COMMUNICATION_FAILURE:
                throw new DrmCommunicationException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_AUTH_FAILURE:
                throw new AuthorizationException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_ARGUMENT:
                throw new IllegalArgumentException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_ACTIVE_SESSION:
                throw new NoActiveSessionException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MEMORY:
                throw new OutOfMemoryError(error);
            /* -------------- init and exit specific --------------- */
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_CONTACT_STRING:
                throw new InvalidContactStringException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DEFAULT_CONTACT_STRING_ERROR:
                throw new DefaultContactStringException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_DEFAULT_CONTACT_STRING_SELECTED:
                throw new NoDefaultContactStringException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DRMS_INIT_FAILED:
                throw new DrmsInitException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_ALREADY_ACTIVE_SESSION:
                throw new AlreadyActiveSessionException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DRMS_EXIT_ERROR:
                throw new DrmsExitException(error);
            /* ---------------- job attributes specific -------------- */
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_ATTRIBUTE_FORMAT:
                throw new InvalidAttributeFormatException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_ATTRIBUTE_VALUE:
                throw new InvalidAttributeValueException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_CONFLICTING_ATTRIBUTE_VALUES:
                throw new ConflictingAttributeValuesException(error);
            /* --------------------- job submission specific -------------- */
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_TRY_LATER:
                throw new TryLaterException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_DENIED_BY_DRM:
                throw new DeniedByDrmException(error);
            /* ------------------------------- job control specific ---------------- */
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_INVALID_JOB:
                throw new InvalidJobException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_RESUME_INCONSISTENT_STATE:
                throw new ResumeInconsistentStateException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUSPEND_INCONSISTENT_STATE:
                throw new SuspendInconsistentStateException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_HOLD_INCONSISTENT_STATE:
                throw new HoldInconsistentStateException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_RELEASE_INCONSISTENT_STATE:
                throw new ReleaseInconsistentStateException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_EXIT_TIMEOUT:
                throw new ExitTimeoutException(error);
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_RUSAGE:
                break; // benign: wait() succeeded but no rusage is available
            case LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_NO_MORE_ELEMENTS:
                break; // benign: native iterator exhausted
            default:
                throw new IllegalArgumentException(String.format("Unknown error code %d: %s", errnum, error));
        }
    }
}

Some files were not shown because too many files have changed in this diff Show More