From 804b2a36b7784040ab31d7a65aba5d8538ac56d3 Mon Sep 17 00:00:00 2001 From: Ron Levine Date: Wed, 7 Jan 2015 09:59:03 -0500 Subject: [PATCH] Fix SplitNCigar reads exception by making the list of RNAReadTransformer non-abstract, add test for -fixNDN Includes documentation changes for -fixNDN argument and the read transformer documentation. Documentation changes to CombineVariants --- .../walkers/rnaseq/SplitNCigarReads.java | 4 +- .../SplitNCigarReadsIntegrationTest.java | 8 ++ .../arguments/GATKArgumentCollection.java | 11 ++- .../filters/NDNCigarReadTransformer.java | 25 +++--- .../walkers/variantutils/CombineVariants.java | 77 +++++++++++-------- 5 files changed, 79 insertions(+), 46 deletions(-) diff --git a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java index 2e549263e..9958a4f7b 100644 --- a/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java +++ b/protected/gatk-tools-protected/src/main/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReads.java @@ -78,6 +78,7 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; import java.io.FileNotFoundException; import java.util.Collections; import java.util.List; +import java.util.ArrayList; /** * @@ -141,8 +142,7 @@ public class SplitNCigarReads extends ReadWalker rnaReadTransformers = Collections.emptyList(); - + private List rnaReadTransformers = new ArrayList<>(); @Override diff --git a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java index 6ed7b7d0b..c615315a9 100644 --- a/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java +++ b/protected/gatk-tools-protected/src/test/java/org/broadinstitute/gatk/tools/walkers/rnaseq/SplitNCigarReadsIntegrationTest.java @@ -91,6 +91,14 @@ public class SplitNCigarReadsIntegrationTest extends WalkerTest { executeTest("test splits with overhangs", spec); } + @Test + public void testSplitsFixNDN() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T SplitNCigarReads -R " + b37KGReference + " -I " + privateTestDir + "splitNCigarReadsSnippet.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS -fixNDN", 1, + Arrays.asList("4ee1c1a64847e2b2f660a3a86f9d7e32")); + executeTest("test fix NDN", spec); + } + @Test public void testSplitsWithOverhangsNotClipping() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java index 575c195b4..5b6ae51f5 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/arguments/GATKArgumentCollection.java @@ -244,11 +244,14 @@ public class GATKArgumentCollection { // // -------------------------------------------------------------------------------------------------------------- /** - * This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in - * a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptoms. - * You should only use this if you know that your reads have that problem. + * Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS + * when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. These + * rarely happen, but when they do they cause GATK to fail with an error. Setting this flag tells the GATK to + * reduce "NDN" to a simpler CIGAR representation with one N element (with total length of the three refactored + * elements). From the point of view of variant calling, there is no meaningful difference between the two + * representations. */ - @Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", required = false) + @Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "Reduce NDN elements in CIGAR string", required = false) public boolean REFACTOR_NDN_CIGAR_READS = false; // -------------------------------------------------------------------------------------------------------------- diff --git a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java index 65bf1eb02..e01827723 100644 --- a/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java +++ b/public/gatk-engine/src/main/java/org/broadinstitute/gatk/engine/filters/NDNCigarReadTransformer.java @@ -37,19 +37,24 @@ import org.broadinstitute.gatk.utils.exceptions.UserException; import org.broadinstitute.gatk.utils.sam.GATKSAMRecord; /** - * A read transformer that refactor NDN cigar elements to one N element. + * Reduce NDN cigar elements to one N element. * - *

- * This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements). - * This is intended primarily for users of RNA-Seq data handling programs such as TopHat2. - * Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of - * those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset. + *

This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length + * of the three refactored elements). The engine parameter that activate this read transformer is + * `--refactor_NDN_cigar_string` / `-fixNDN`

* - * NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine. - * - * The engine parameter that activate this read transformer is --refactor_NDN_cigar_string or -fixNDN - *

+ *

Rationale

+ *

Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS + * when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. Currently + * we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of + * those specific reads, this read transformer allows users of TopHat and other tools to circumvent this problem without + * affecting the rest of their dataset. From the point of view of variant calling, there is no meaningful difference between + * the two representations.

* + *

Developer notes

+ *
    + *
  • Any walker that needs to apply this functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
  • + *
* * * @author ami diff --git a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java index 7783b5b31..51f0c40bc 100644 --- a/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java +++ b/public/gatk-tools-public/src/main/java/org/broadinstitute/gatk/tools/walkers/variantutils/CombineVariants.java @@ -52,41 +52,36 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter; import java.util.*; + /** - * Combines VCF records from different sources. + * Combine variant records from different sources * - *

- * CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data - * and any number of sources can be input. This tool currently supports two different combination types for each of - * variants (the first 8 fields of the VCF) and genotypes (the rest). - * Merge: combines multiple records into a single one; if sample names overlap then they are uniquified. - * Union: assumes each rod represents the same set of samples (although this is not enforced); using the - * priority list (if provided), it emits a single record instance at every position represented in the rods. + *

CombineVariants reads in variants records from separate ROD (Reference-Ordered Data) sources and combines them into + * a single VCF. Any (unique) name can be used to bind your ROD and any number of sources can be input. This tool aims + * to fulfill two main possible use cases, reflected by the two combination options (MERGE and UNION), for merging + * records at the variant level (the first 8 fields of the VCF) or at the genotype level (the rest).

* - * CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD - * bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect, - * CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the N merged VCFs - * can be exacted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just + *
    + *
  • MERGE: combines multiple variant records present at the same site in the different input sources into a + * single variant record in the output. If sample names overlap, then they are "uniquified" by default, which means a + * suffix is appended to make them unique. Note that in version 3.3, the automatic uniquifying was disabled + * (unintentionally), and required setting `-genotypeMergeOptions UNIQUIFY` manually.
  • + * + *
  • UNION: assumes that each ROD source represents the same set of samples (although this is not enforced). + * It uses the priority list (if provided) to emit a single record instance at every position represented in the input RODs.
  • + *
+ * + *

CombineVariants will emit a record for every site that was present in any of your input VCF files, and will annotate + * (in the set attribute in the INFO field) whether the record had a PASS or FILTER status in each input ROD . In effect, + * CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the merged VCFs + * can be extracted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just * the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single - * VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out - * in the detailed example in the documentation guide. - * - * Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful - * when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time - * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together - * efficiency. However, since this merge runs in only one thread, you can quickly reach diminishing - * returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much. - * - * Some fine details about the merging algorithm: - *

    - *
  • As of GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of - * the first VCF record with a non-MISSING QUAL value. The previous behavior was to take the - * max QUAL, which resulted in sometime strange downstream confusion
  • - *
+ * VCF and then run SelectVariants to extract the common records with `-select 'set == "Intersection"'`, as worked out + * in the detailed example in the documentation guide.

* *

Input

*

- * One or more variant sets to combine. + * Two or more variant sets to combine. *

* *

Output

@@ -95,6 +90,8 @@ import java.util.*; *

* *

Examples

+ *   + *

Merge two separate callsets

*
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
@@ -103,17 +100,37 @@ import java.util.*;
  *   --variant input2.vcf \
  *   -o output.vcf \
  *   -genotypeMergeOptions UNIQUIFY
- *
+ * 
+ *

Get the union of calls made on the same samples

+ *
  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
  *   -T CombineVariants \
  *   --variant:foo input1.vcf \
  *   --variant:bar input2.vcf \
  *   -o output.vcf \
- *   -genotypeMergeOptions PRIORITIZE
+ *   -genotypeMergeOptions PRIORITIZE \
  *   -priority foo,bar
  * 
* + *

Caveats

+ *
    + *
  • This tool is not intended to manipulate GVCFS! To combine GVCF files output by HaplotypeCaller, use CombineGVCFs.
  • + *
  • To join intermediate VCFs produced by running jobs in parallel by interval (e.g. by chromosome), use CatVariants.
  • + *
+ * + *

Additional notes

+ *
    + *
  • Using this tool's multi-threaded parallelism capability is particularly useful + * when converting from VCF to BCF2, which can be time-consuming. In this case each thread spends CPU time + * doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together + * efficiently. However, since this merge runs in only one thread, you can quickly reach diminishing + * returns with the number of parallel threads. In our hands, `-nt 4` works well but `-nt 8` tends to be be too much.
  • + *
  • Since GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of the first + * VCF record with a non-MISSING QUAL value. The previous behavior was to take the max QUAL, which could result + * in strange downstream confusion
  • + *
+ * */ @DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=-50,stop=50))