diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java
index 2e549263e..9958a4f7b 100644
--- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java
+++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java
@@ -78,6 +78,7 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import java.io.FileNotFoundException;
import java.util.Collections;
import java.util.List;
+import java.util.ArrayList;
/**
*
@@ -141,8 +142,7 @@ public class SplitNCigarReads extends ReadWalker rnaReadTransformers = Collections.emptyList();
-
+ private List rnaReadTransformers = new ArrayList<>();
@Override
diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java
index 6ed7b7d0b..c615315a9 100644
--- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java
+++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java
@@ -91,6 +91,14 @@ public class SplitNCigarReadsIntegrationTest extends WalkerTest {
executeTest("test splits with overhangs", spec);
}
+ @Test
+ public void testSplitsFixNDN() {
+ WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
+ "-T SplitNCigarReads -R " + b37KGReference + " -I " + privateTestDir + "splitNCigarReadsSnippet.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS -fixNDN", 1,
+ Arrays.asList("4ee1c1a64847e2b2f660a3a86f9d7e32"));
+ executeTest("test fix NDN", spec);
+ }
+
@Test
public void testSplitsWithOverhangsNotClipping() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java
index 575c195b4..5b6ae51f5 100644
--- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java
+++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java
@@ -244,11 +244,14 @@ public class GATKArgumentCollection {
//
// --------------------------------------------------------------------------------------------------------------
/**
- * This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in
- * a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptoms.
- * You should only use this if you know that your reads have that problem.
+ * Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS
+ * when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. These
+ * rarely happen, but when they do they cause GATK to fail with an error. Setting this flag tells the GATK to
+ * reduce "NDN" to a simpler CIGAR representation with one N element (with total length of the three refactored
+ * elements). From the point of view of variant calling, there is no meaningful difference between the two
+ * representations.
*/
- @Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", required = false)
+ @Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "Reduce NDN elements in CIGAR string", required = false)
public boolean REFACTOR_NDN_CIGAR_READS = false;
// --------------------------------------------------------------------------------------------------------------
diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java
index 65bf1eb02..e01827723 100644
--- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java
+++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java
@@ -37,19 +37,24 @@ import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
/**
- * A read transformer that refactor NDN cigar elements to one N element.
+ * Reduce NDN cigar elements to one N element.
*
- *
- * This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements).
- * This is intended primarily for users of RNA-Seq data handling programs such as TopHat2.
- * Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
- * those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset.
+ *
This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length
+ * of the three refactored elements). The engine parameter that activate this read transformer is
+ * `--refactor_NDN_cigar_string` / `-fixNDN`
*
- * NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
- *
- * The engine parameter that activate this read transformer is --refactor_NDN_cigar_string or -fixNDN
- *
+ * Rationale
+ * Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS
+ * when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. Currently
+ * we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
+ * those specific reads, this read transformer allows users of TopHat and other tools to circumvent this problem without
+ * affecting the rest of their dataset. From the point of view of variant calling, there is no meaningful difference between
+ * the two representations.
*
+ * Developer notes
+ *
+ * - Any walker that needs to apply this functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
+ *
*
*
* @author ami
diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java
index 7783b5b31..51f0c40bc 100644
--- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java
+++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java
@@ -52,41 +52,36 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import java.util.*;
+
/**
- * Combines VCF records from different sources.
+ * Combine variant records from different sources
*
- *
- * CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data
- * and any number of sources can be input. This tool currently supports two different combination types for each of
- * variants (the first 8 fields of the VCF) and genotypes (the rest).
- * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified.
- * Union: assumes each rod represents the same set of samples (although this is not enforced); using the
- * priority list (if provided), it emits a single record instance at every position represented in the rods.
+ *
CombineVariants reads in variants records from separate ROD (Reference-Ordered Data) sources and combines them into
+ * a single VCF. Any (unique) name can be used to bind your ROD and any number of sources can be input. This tool aims
+ * to fulfill two main possible use cases, reflected by the two combination options (MERGE and UNION), for merging
+ * records at the variant level (the first 8 fields of the VCF) or at the genotype level (the rest).
*
- * CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD
- * bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect,
- * CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the N merged VCFs
- * can be exacted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just
+ *
+ * - MERGE: combines multiple variant records present at the same site in the different input sources into a
+ * single variant record in the output. If sample names overlap, then they are "uniquified" by default, which means a
+ * suffix is appended to make them unique. Note that in version 3.3, the automatic uniquifying was disabled
+ * (unintentionally), and required setting `-genotypeMergeOptions UNIQUIFY` manually.
+ *
+ * - UNION: assumes that each ROD source represents the same set of samples (although this is not enforced).
+ * It uses the priority list (if provided) to emit a single record instance at every position represented in the input RODs.
+ *
+ *
+ * CombineVariants will emit a record for every site that was present in any of your input VCF files, and will annotate
+ * (in the set attribute in the INFO field) whether the record had a PASS or FILTER status in each input ROD . In effect,
+ * CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the merged VCFs
+ * can be extracted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just
* the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single
- * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
- * in the detailed example in the documentation guide.
- *
- * Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful
- * when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time
- * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
- * efficiency. However, since this merge runs in only one thread, you can quickly reach diminishing
- * returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much.
- *
- * Some fine details about the merging algorithm:
- *
- * - As of GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of
- * the first VCF record with a non-MISSING QUAL value. The previous behavior was to take the
- * max QUAL, which resulted in sometime strange downstream confusion
- *
+ * VCF and then run SelectVariants to extract the common records with `-select 'set == "Intersection"'`, as worked out
+ * in the detailed example in the documentation guide.
*
* Input
*
- * One or more variant sets to combine.
+ * Two or more variant sets to combine.
*
*
* Output
@@ -95,6 +90,8 @@ import java.util.*;
*
*
* Examples
+ *
+ * Merge two separate callsets
*
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
@@ -103,17 +100,37 @@ import java.util.*;
* --variant input2.vcf \
* -o output.vcf \
* -genotypeMergeOptions UNIQUIFY
- *
+ *
+ * Get the union of calls made on the same samples
+ *
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CombineVariants \
* --variant:foo input1.vcf \
* --variant:bar input2.vcf \
* -o output.vcf \
- * -genotypeMergeOptions PRIORITIZE
+ * -genotypeMergeOptions PRIORITIZE \
* -priority foo,bar
*
*
+ * Caveats
+ *
+ * - This tool is not intended to manipulate GVCFS! To combine GVCF files output by HaplotypeCaller, use CombineGVCFs.
+ * - To join intermediate VCFs produced by running jobs in parallel by interval (e.g. by chromosome), use CatVariants.
+ *
+ *
+ * Additional notes
+ *
+ * - Using this tool's multi-threaded parallelism capability is particularly useful
+ * when converting from VCF to BCF2, which can be time-consuming. In this case each thread spends CPU time
+ * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
+ * efficiently. However, since this merge runs in only one thread, you can quickly reach diminishing
+ * returns with the number of parallel threads. In our hands, `-nt 4` works well but `-nt 8` tends to be be too much.
+ * - Since GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of the first
+ * VCF record with a non-MISSING QUAL value. The previous behavior was to take the max QUAL, which could result
+ * in strange downstream confusion
+ *
+ *
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-50,stop=50))