Merge pull request #798 from broadinstitute/rhl_split_n_cigar_reads_exception

Rhl split n cigar reads exception
2015-01-15 00:20:49 -05:00 · 2015-01-15 00:20:49 -05:00 · 7102477168
parent 0292d49842 804b2a36b7
commit 7102477168
5 changed files with 79 additions and 46 deletions
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java
@ -78,6 +78,7 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
 import java.io.FileNotFoundException;
 import java.util.Collections;
 import java.util.List;
+import java.util.ArrayList;

 /**
 *
@ -141,8 +142,7 @@ public class SplitNCigarReads extends ReadWalker<GATKSAMRecord, OverhangFixingMa
     * It will emit reads to the underlying writer as needed so we don't need to worry about any of that in this class.
     */
    protected OverhangFixingManager overhangManager;
-    private List<RNAReadTransformer> rnaReadTransformers = Collections.emptyList();
-
+    private List<RNAReadTransformer> rnaReadTransformers = new ArrayList<>();


    @Override
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java
@ -91,6 +91,14 @@ public class SplitNCigarReadsIntegrationTest extends WalkerTest {
        executeTest("test splits with overhangs", spec);
    }

+    @Test
+    public void testSplitsFixNDN() {
+        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
+                "-T SplitNCigarReads -R " + b37KGReference + " -I " + privateTestDir + "splitNCigarReadsSnippet.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS -fixNDN", 1,
+                Arrays.asList("4ee1c1a64847e2b2f660a3a86f9d7e32"));
+        executeTest("test fix NDN", spec);
+    }
+
    @Test
    public void testSplitsWithOverhangsNotClipping() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
--- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java
+++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java
@ -244,11 +244,14 @@ public class GATKArgumentCollection {
    //
    // --------------------------------------------------------------------------------------------------------------
    /**
-     * This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in
-     * a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptoms.
-     * You should only use this if you know that your reads have that problem.
+     * Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS
+     * when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. These
+     * rarely happen, but when they do they cause GATK to fail with an error. Setting this flag tells the GATK to
+     * reduce "NDN" to a simpler CIGAR representation with one N element (with total length of the three refactored
+     * elements). From the point of view of variant calling, there is no meaningful difference between the two
+     * representations.
     */
-    @Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", required = false)
+    @Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "Reduce NDN elements in CIGAR string", required = false)
    public boolean REFACTOR_NDN_CIGAR_READS = false;

    // --------------------------------------------------------------------------------------------------------------
--- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java
+++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java
@ -37,19 +37,24 @@ import org.broadinstitute.gatk.utils.exceptions.UserException;
 import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;

 /**
- * A read transformer that refactor NDN cigar elements to one N element.
+ * Reduce NDN cigar elements to one N element.
 *
- *  <p>
- *     This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements).
- *     This is intended primarily for users of RNA-Seq data handling programs such as TopHat2.
- *     Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
- *     those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset.
+ * <p>This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length
+ * of the three refactored elements). The engine parameter that activate this read transformer is
+ * `--refactor_NDN_cigar_string` / `-fixNDN`</p>
 *
- *     NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
- *
- *     The engine parameter that activate this read transformer is --refactor_NDN_cigar_string or -fixNDN
- *  </p>
+ * <h3>Rationale</h3>
+ * <p>Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS
+ * when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. Currently
+ * we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
+ * those specific reads, this read transformer allows users of TopHat and other tools to circumvent this problem without
+ * affecting the rest of their dataset. From the point of view of variant calling, there is no meaningful difference between
+ * the two representations.</p>
 *
+ * <h3>Developer notes</h3>
+ * <ul>
+ *     <li>Any walker that needs to apply this functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.</li>
+ * </ul>
 *
 *
 * @author ami
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java
@ -52,41 +52,36 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter;

 import java.util.*;

+
 /**
- * Combines VCF records from different sources.
+ * Combine variant records from different sources
 *
- * <p>
- * CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data
- * and any number of sources can be input. This tool currently supports two different combination types for each of
- * variants (the first 8 fields of the VCF) and genotypes (the rest).
- * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified.
- * Union: assumes each rod represents the same set of samples (although this is not enforced); using the
- * priority list (if provided), it emits a single record instance at every position represented in the rods.
+ * <p>CombineVariants reads in variants records from separate ROD (Reference-Ordered Data) sources and combines them into
+ * a single VCF. Any (unique) name can be used to bind your ROD and any number of sources can be input. This tool aims
+ * to fulfill two main possible use cases, reflected by the two combination options (MERGE and UNION), for merging
+ * records at the variant level (the first 8 fields of the VCF) or at the genotype level (the rest).</p>
 *
- * CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD
- * bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect,
- * CombineVariants always produces a union of the input VCFs.  However, any part of the Venn of the N merged VCFs
- * can be exacted using JEXL expressions on the set attribute using SelectVariants.  If you want to extract just
+ * <ul>
+ * <li><b>MERGE:</b> combines multiple variant records present at the same site in the different input sources into a
+ * single variant record in the output. If sample names overlap, then they are "uniquified" by default, which means a
+ * suffix is appended to make them unique. <em>Note that in version 3.3, the automatic uniquifying was disabled
+ * (unintentionally), and required setting `-genotypeMergeOptions UNIQUIFY` manually.</em></li>
+ *
+ * <li><b>UNION:</b> assumes that each ROD source represents the same set of samples (although this is not enforced).
+ * It uses the priority list (if provided) to emit a single record instance at every position represented in the input RODs.</li>
+ * </ul>
+ *
+ * <p>CombineVariants will emit a record for every site that was present in any of your input VCF files, and will annotate
+ * (in the set attribute in the INFO field) whether the record had a PASS or FILTER status in each input ROD . In effect,
+ * CombineVariants always produces a union of the input VCFs.  However, any part of the Venn of the merged VCFs
+ * can be extracted using JEXL expressions on the set attribute using SelectVariants.  If you want to extract just
 * the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single
- * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
- * in the detailed example in the documentation guide.
- *
- * Note that CombineVariants supports multi-threaded parallelism (8/15/12).  This is particularly useful
- * when converting from VCF to BCF2, which can be expensive.  In this case each thread spends CPU time
- * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
- * efficiency.  However, since this merge runs in only one thread, you can quickly reach diminishing
- * returns with the number of parallel threads.  -nt 4 works well but -nt 8 may be too much.
- *
- * Some fine details about the merging algorithm:
- *   <ul>
- *   <li> As of GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of
- *      the first VCF record with a non-MISSING QUAL value.  The previous behavior was to take the
- *      max QUAL, which resulted in sometime strange downstream confusion</li>
- *   </ul>
+ * VCF and then run SelectVariants to extract the common records with `-select 'set == "Intersection"'`, as worked out
+ * in the detailed example in the documentation guide.</p>
 *
 * <h3>Input</h3>
 * <p>
- * One or more variant sets to combine.
+ * Two or more variant sets to combine.
 * </p>
 *
 * <h3>Output</h3>
@ -95,6 +90,8 @@ import java.util.*;
 * </p>
 *
 * <h3>Examples</h3>
+ * &nbsp;
+ * <h4>Merge two separate callsets</h4>
 * <pre>
 * java -Xmx2g -jar GenomeAnalysisTK.jar \
 *   -R ref.fasta \
@ -103,17 +100,37 @@ import java.util.*;
 *   --variant input2.vcf \
 *   -o output.vcf \
 *   -genotypeMergeOptions UNIQUIFY
- *
+ * </pre>
+ * <h4>Get the union of calls made on the same samples </h4>
+ * <pre>
 * java -Xmx2g -jar GenomeAnalysisTK.jar \
 *   -R ref.fasta \
 *   -T CombineVariants \
 *   --variant:foo input1.vcf \
 *   --variant:bar input2.vcf \
 *   -o output.vcf \
- *   -genotypeMergeOptions PRIORITIZE
+ *   -genotypeMergeOptions PRIORITIZE \
 *   -priority foo,bar
 * </pre>
 *
+ * <h3>Caveats</h3>
+ * <ul>
+ * <li>This tool is not intended to manipulate GVCFS! To combine GVCF files output by HaplotypeCaller, use CombineGVCFs.</li>
+ * <li>To join intermediate VCFs produced by running jobs in parallel by interval (e.g. by chromosome), use CatVariants.</li>
+ * </ul>
+ *
+ * <h3>Additional notes</h3>
+ * <ul>
+ * <li> Using this tool's multi-threaded parallelism capability is particularly useful
+ * when converting from VCF to BCF2, which can be time-consuming. In this case each thread spends CPU time
+ * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
+ * efficiently.  However, since this merge runs in only one thread, you can quickly reach diminishing
+ * returns with the number of parallel threads.  In our hands, `-nt 4` works well but `-nt 8` tends to be be too much.</li>
+ * <li>Since GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of the first
+ * VCF record with a non-MISSING QUAL value.  The previous behavior was to take the max QUAL, which could result
+ * in strange downstream confusion</li>
+ * </ul>
+ *
 */
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-50,stop=50))