Fix SplitNCigar reads exception by making the list of RNAReadTransformer non-abstract, add test for -fixNDN
Includes documentation changes for -fixNDN argument and the read transformer documentation. Documentation changes to CombineVariants
This commit is contained in:
parent
0292d49842
commit
804b2a36b7
|
|
@ -78,6 +78,7 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
|
|||
import java.io.FileNotFoundException;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -141,8 +142,7 @@ public class SplitNCigarReads extends ReadWalker<GATKSAMRecord, OverhangFixingMa
|
|||
* It will emit reads to the underlying writer as needed so we don't need to worry about any of that in this class.
|
||||
*/
|
||||
protected OverhangFixingManager overhangManager;
|
||||
private List<RNAReadTransformer> rnaReadTransformers = Collections.emptyList();
|
||||
|
||||
private List<RNAReadTransformer> rnaReadTransformers = new ArrayList<>();
|
||||
|
||||
|
||||
@Override
|
||||
|
|
|
|||
|
|
@ -91,6 +91,14 @@ public class SplitNCigarReadsIntegrationTest extends WalkerTest {
|
|||
executeTest("test splits with overhangs", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSplitsFixNDN() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T SplitNCigarReads -R " + b37KGReference + " -I " + privateTestDir + "splitNCigarReadsSnippet.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS -fixNDN", 1,
|
||||
Arrays.asList("4ee1c1a64847e2b2f660a3a86f9d7e32"));
|
||||
executeTest("test fix NDN", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSplitsWithOverhangsNotClipping() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
|
|
|
|||
|
|
@ -244,11 +244,14 @@ public class GATKArgumentCollection {
|
|||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
/**
|
||||
* This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in
|
||||
* a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptoms.
|
||||
* You should only use this if you know that your reads have that problem.
|
||||
* Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS
|
||||
* when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. These
|
||||
* rarely happen, but when they do they cause GATK to fail with an error. Setting this flag tells the GATK to
|
||||
* reduce "NDN" to a simpler CIGAR representation with one N element (with total length of the three refactored
|
||||
* elements). From the point of view of variant calling, there is no meaningful difference between the two
|
||||
* representations.
|
||||
*/
|
||||
@Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", required = false)
|
||||
@Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "Reduce NDN elements in CIGAR string", required = false)
|
||||
public boolean REFACTOR_NDN_CIGAR_READS = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -37,19 +37,24 @@ import org.broadinstitute.gatk.utils.exceptions.UserException;
|
|||
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
|
||||
|
||||
/**
|
||||
* A read transformer that refactor NDN cigar elements to one N element.
|
||||
* Reduce NDN cigar elements to one N element.
|
||||
*
|
||||
* <p>
|
||||
* This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements).
|
||||
* This is intended primarily for users of RNA-Seq data handling programs such as TopHat2.
|
||||
* Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
|
||||
* those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset.
|
||||
* <p>This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length
|
||||
* of the three refactored elements). The engine parameter that activate this read transformer is
|
||||
* `--refactor_NDN_cigar_string` / `-fixNDN`</p>
|
||||
*
|
||||
* NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
|
||||
*
|
||||
* The engine parameter that activate this read transformer is --refactor_NDN_cigar_string or -fixNDN
|
||||
* </p>
|
||||
* <h3>Rationale</h3>
|
||||
* <p>Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS
|
||||
* when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. Currently
|
||||
* we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
|
||||
* those specific reads, this read transformer allows users of TopHat and other tools to circumvent this problem without
|
||||
* affecting the rest of their dataset. From the point of view of variant calling, there is no meaningful difference between
|
||||
* the two representations.</p>
|
||||
*
|
||||
* <h3>Developer notes</h3>
|
||||
* <ul>
|
||||
* <li>Any walker that needs to apply this functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.</li>
|
||||
* </ul>
|
||||
*
|
||||
*
|
||||
* @author ami
|
||||
|
|
|
|||
|
|
@ -52,41 +52,36 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter;
|
|||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* Combines VCF records from different sources.
|
||||
* Combine variant records from different sources
|
||||
*
|
||||
* <p>
|
||||
* CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data
|
||||
* and any number of sources can be input. This tool currently supports two different combination types for each of
|
||||
* variants (the first 8 fields of the VCF) and genotypes (the rest).
|
||||
* Merge: combines multiple records into a single one; if sample names overlap then they are uniquified.
|
||||
* Union: assumes each rod represents the same set of samples (although this is not enforced); using the
|
||||
* priority list (if provided), it emits a single record instance at every position represented in the rods.
|
||||
* <p>CombineVariants reads in variants records from separate ROD (Reference-Ordered Data) sources and combines them into
|
||||
* a single VCF. Any (unique) name can be used to bind your ROD and any number of sources can be input. This tool aims
|
||||
* to fulfill two main possible use cases, reflected by the two combination options (MERGE and UNION), for merging
|
||||
* records at the variant level (the first 8 fields of the VCF) or at the genotype level (the rest).</p>
|
||||
*
|
||||
* CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD
|
||||
* bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect,
|
||||
* CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the N merged VCFs
|
||||
* can be exacted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just
|
||||
* <ul>
|
||||
* <li><b>MERGE:</b> combines multiple variant records present at the same site in the different input sources into a
|
||||
* single variant record in the output. If sample names overlap, then they are "uniquified" by default, which means a
|
||||
* suffix is appended to make them unique. <em>Note that in version 3.3, the automatic uniquifying was disabled
|
||||
* (unintentionally), and required setting `-genotypeMergeOptions UNIQUIFY` manually.</em></li>
|
||||
*
|
||||
* <li><b>UNION:</b> assumes that each ROD source represents the same set of samples (although this is not enforced).
|
||||
* It uses the priority list (if provided) to emit a single record instance at every position represented in the input RODs.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>CombineVariants will emit a record for every site that was present in any of your input VCF files, and will annotate
|
||||
* (in the set attribute in the INFO field) whether the record had a PASS or FILTER status in each input ROD . In effect,
|
||||
* CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the merged VCFs
|
||||
* can be extracted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just
|
||||
* the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single
|
||||
* VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
|
||||
* in the detailed example in the documentation guide.
|
||||
*
|
||||
* Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful
|
||||
* when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time
|
||||
* doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
|
||||
* efficiency. However, since this merge runs in only one thread, you can quickly reach diminishing
|
||||
* returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much.
|
||||
*
|
||||
* Some fine details about the merging algorithm:
|
||||
* <ul>
|
||||
* <li> As of GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of
|
||||
* the first VCF record with a non-MISSING QUAL value. The previous behavior was to take the
|
||||
* max QUAL, which resulted in sometime strange downstream confusion</li>
|
||||
* </ul>
|
||||
* VCF and then run SelectVariants to extract the common records with `-select 'set == "Intersection"'`, as worked out
|
||||
* in the detailed example in the documentation guide.</p>
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
* One or more variant sets to combine.
|
||||
* Two or more variant sets to combine.
|
||||
* </p>
|
||||
*
|
||||
* <h3>Output</h3>
|
||||
|
|
@ -95,6 +90,8 @@ import java.util.*;
|
|||
* </p>
|
||||
*
|
||||
* <h3>Examples</h3>
|
||||
*
|
||||
* <h4>Merge two separate callsets</h4>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
|
|
@ -103,17 +100,37 @@ import java.util.*;
|
|||
* --variant input2.vcf \
|
||||
* -o output.vcf \
|
||||
* -genotypeMergeOptions UNIQUIFY
|
||||
*
|
||||
* </pre>
|
||||
* <h4>Get the union of calls made on the same samples </h4>
|
||||
* <pre>
|
||||
* java -Xmx2g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T CombineVariants \
|
||||
* --variant:foo input1.vcf \
|
||||
* --variant:bar input2.vcf \
|
||||
* -o output.vcf \
|
||||
* -genotypeMergeOptions PRIORITIZE
|
||||
* -genotypeMergeOptions PRIORITIZE \
|
||||
* -priority foo,bar
|
||||
* </pre>
|
||||
*
|
||||
* <h3>Caveats</h3>
|
||||
* <ul>
|
||||
* <li>This tool is not intended to manipulate GVCFS! To combine GVCF files output by HaplotypeCaller, use CombineGVCFs.</li>
|
||||
* <li>To join intermediate VCFs produced by running jobs in parallel by interval (e.g. by chromosome), use CatVariants.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <h3>Additional notes</h3>
|
||||
* <ul>
|
||||
* <li> Using this tool's multi-threaded parallelism capability is particularly useful
|
||||
* when converting from VCF to BCF2, which can be time-consuming. In this case each thread spends CPU time
|
||||
* doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
|
||||
* efficiently. However, since this merge runs in only one thread, you can quickly reach diminishing
|
||||
* returns with the number of parallel threads. In our hands, `-nt 4` works well but `-nt 8` tends to be be too much.</li>
|
||||
* <li>Since GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of the first
|
||||
* VCF record with a non-MISSING QUAL value. The previous behavior was to take the max QUAL, which could result
|
||||
* in strange downstream confusion</li>
|
||||
* </ul>
|
||||
*
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
|
||||
@Reference(window=@Window(start=-50,stop=50))
|
||||
|
|
|
|||
Loading…
Reference in New Issue