Misc documentation improvements

Added caveat to VariantFiltration documentation
  Fixed PON creation example in M2 doc
  Improved MalformedReadFilter doc
  Updated N CIGAR error message
This commit is contained in:
Geraldine Van der Auwera 2016-02-27 13:11:17 -05:00
parent 8c15d3ccd8
commit 2b70f14740
3 changed files with 48 additions and 30 deletions

View File

@ -141,7 +141,7 @@ import static java.lang.Math.pow;
* <pre> * <pre>
* java * java
* -jar GenomeAnalysisTK.jar * -jar GenomeAnalysisTK.jar
* -T HaplotypeCaller * -T MuTect2
* -R reference.fasta * -R reference.fasta
* -I:tumor normal1.bam \ * -I:tumor normal1.bam \
* [--dbsnp dbSNP.vcf] \ * [--dbsnp dbSNP.vcf] \

View File

@ -32,17 +32,36 @@ import org.broadinstitute.gatk.engine.ReadProperties;
import org.broadinstitute.gatk.utils.ValidationExclusion; import org.broadinstitute.gatk.utils.ValidationExclusion;
import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource; import org.broadinstitute.gatk.engine.datasources.reads.SAMDataSource;
import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.help.HelpConstants;
/** /**
* Filter out malformed reads * Filter out malformed reads
* *
* <p>This filter is applied automatically by all GATK tools in order to protect them from crashing on reads that are * <p>This filter is applied automatically by all GATK tools in order to protect them from crashing on reads that are
* grossly malformed. There are a few issues (such as the absence of sequence bases) that will cause the run to fail with an * malformed. There are a few types of malformation (such as the absence of sequence bases) that are not filtered out
* error, but these cases can be preempted by setting flags that cause the problem reads to also be filtered.</p> * by default and can cause errors, but these cases can be preempted by setting flags that cause the problem reads to
* also be filtered.</p>
*
* <h4>Criteria used by default</h4>
* <ul>
* <li><b>Invalid Alignment Start:</b> Read alignment start is inconsistent with the read unmapped flag; either read is not flagged as 'unmapped', but alignment start is NO_ALIGNMENT_START, or read is not flagged as 'unmapped', but alignment start is -1.</li>
* <li><b>Invalid Alignment End:</b> Read aligns to negative number of bases in the reference.</li>
* <li><b>Alignment Disagrees With Header:</b> Read is aligned to nonexistent contig or read is aligned to a point after the end of the contig.</li>
* <li><b>Missing or Undefined Read Group:</b> Either the RG tag is missing, it is not defined in the header, or required elements such as RGID are missing.</li>
* <li><b>Cigar Disagrees With Alignment:</b> Read has a valid alignment start, but the CIGAR string is empty.</li>
* <li><b>CIGAR Is Not Supported:</b> Read CIGAR contains operators that are not supported (N which is treated separately).</li>
* </ul>
*
* <h4>Optional criteria</h4>
* <ul>
* <li><b>Mismatching Bases And Quals:</b> Read does not have the same number of bases and base qualities.</li>
* <li><b>Bases Not Stored:</b> Read has no stored bases; the SEQ field contains '*' instead.</li>
* <li><b>CIGAR With N Operator:</b> Read CIGAR contains N operator (typical of RNA-Seq data).</li>
* </ul>
* *
* <h3>Usage example</h3> * <h3>Usage example</h3>
* *
* <h4>Set the malformed read filter to filter out reads that have no sequence bases</h4> * <h4>Set the malformed read filter to also filter out reads that have no stored sequence bases</h4>
* <pre> * <pre>
* java -jar GenomeAnalysisTk.jar \ * java -jar GenomeAnalysisTk.jar \
* -T ToolName \ * -T ToolName \
@ -200,26 +219,13 @@ public class MalformedReadFilter extends ReadFilter {
if (! filterReadsWithNCigar && !allowNCigars) { if (! filterReadsWithNCigar && !allowNCigars) {
throw new UserException.UnsupportedCigarOperatorException( throw new UserException.UnsupportedCigarOperatorException(
CigarOperator.N,read, CigarOperator.N,read,
"Perhaps you are" "If you are working with RNA-Seq data, see " + HelpConstants.articlePost("3891") + " for guidance. "
+ " trying to use RNA-Seq data?" + "If you choose to disregard those instructions, or for other uses, you have the option of either "
+ " While we are currently actively working to" + "filtering out all reads with operator " + CigarOperator.N + " in their CIGAR string" + " (add --"
+ " support this data type unfortunately the" + FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME + " to your command line) or overriding this check (add -U "
+ " GATK cannot be used with this data in its" + ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS + " to your command line). Notice however that the latter "
+ " current form. You have the option of either" + "is unsupported, so if you use it and encounter any problems, the GATK support team not be able to help "
+ " filtering out all reads with operator " + "you.");
+ CigarOperator.N + " in their CIGAR string"
+ " (please add --"
+ FILTER_READS_WITH_N_CIGAR_ARGUMENT_FULL_NAME
+ " to your command line) or"
+ " assume the risk of processing those reads as they"
+ " are including the pertinent unsafe flag (please add -U"
+ ' ' + ValidationExclusion.TYPE.ALLOW_N_CIGAR_READS
+ " to your command line). Notice however that if you were"
+ " to choose the latter, an unspecified subset of the"
+ " analytical outputs of an unspecified subset of the tools"
+ " will become unpredictable. Consequently the GATK team"
+ " might well not be able to provide you with the usual support"
+ " with any issue regarding any output");
} }
return ! filterReadsWithNCigar; return ! filterReadsWithNCigar;
} }

View File

@ -53,9 +53,13 @@ import java.util.*;
* Filter variant calls based on INFO and FORMAT annotations * Filter variant calls based on INFO and FORMAT annotations
* *
* <p> * <p>
* This tool is designed for hard-filtering variant calls based on certain criteria. * This tool is designed for hard-filtering variant calls based on certain criteria. Records are hard-filtered
* Records are hard-filtered by changing the value in the FILTER field to something other than PASS. Filtered records * by changing the value in the FILTER field to something other than PASS. Filtered records will be preserved
* will be preserved in the output unless their removal is requested in the command line. </p> * in the output unless their removal is requested in the command line. </p>
*
* <p>The most common way of specifying filtering criteria is by using JEXL queries. See the
* <a href='https://www.broadinstitute.org/gatk/guide/article?id=1255'>article on JEXL expressions</a> in the
* documentation Guide for detailed information and examples.</p>
* *
* <h3>Input</h3> * <h3>Input</h3>
* <p> * <p>
@ -75,11 +79,19 @@ import java.util.*;
* -o output.vcf \ * -o output.vcf \
* --variant input.vcf \ * --variant input.vcf \
* --filterExpression "AB < 0.2 || MQ0 > 50" \ * --filterExpression "AB < 0.2 || MQ0 > 50" \
* --filterName "Nov09filters" \ * --filterName "SomeFilterName"
* --mask mask.vcf \
* --maskName InDel
* </pre> * </pre>
* *
* <h3>Caveat</h3>
* <p>When you run VariantFiltration with a command that includes multiple logical parts, each part of the command is applied
* individually to the original form of the VCF record. Say you ran a VF command that includes three parts: one applies
* some genotype filters, another applies setFilterGtToNoCall (which changes sample genotypes to ./. whenever a sample has a
* genotype-level FT annotation), and yet another one filters sites based on whether any samples have a no-call there. You might
* think that such a command would allow you to filter sites based on sample-level annotations in one go. However, that would only
* work if the parts of the command were applied internally in series (like a pipeline) but that's not the case; they are applied
* in parallel to the same original record. So unfortunately, to achieve the desired result, these filters should be applied as
* separate commands.</p>
*
*/ */
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-50,stop=50)) @Reference(window=@Window(start=-50,stop=50))