Merge pull request #798 from broadinstitute/rhl_split_n_cigar_reads_exception

Rhl split n cigar reads exception
This commit is contained in:
Geraldine Van der Auwera 2015-01-15 00:20:49 -05:00
commit 7102477168
5 changed files with 79 additions and 46 deletions

View File

@ -78,6 +78,7 @@ import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
import java.io.FileNotFoundException;
import java.util.Collections;
import java.util.List;
import java.util.ArrayList;
/**
*
@ -141,8 +142,7 @@ public class SplitNCigarReads extends ReadWalker<GATKSAMRecord, OverhangFixingMa
* It will emit reads to the underlying writer as needed so we don't need to worry about any of that in this class.
*/
protected OverhangFixingManager overhangManager;
private List<RNAReadTransformer> rnaReadTransformers = Collections.emptyList();
private List<RNAReadTransformer> rnaReadTransformers = new ArrayList<>();
@Override

View File

@ -91,6 +91,14 @@ public class SplitNCigarReadsIntegrationTest extends WalkerTest {
executeTest("test splits with overhangs", spec);
}
@Test
public void testSplitsFixNDN() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T SplitNCigarReads -R " + b37KGReference + " -I " + privateTestDir + "splitNCigarReadsSnippet.bam -o %s --no_pg_tag -U ALLOW_N_CIGAR_READS -fixNDN", 1,
Arrays.asList("4ee1c1a64847e2b2f660a3a86f9d7e32"));
executeTest("test fix NDN", spec);
}
@Test
public void testSplitsWithOverhangsNotClipping() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(

View File

@ -244,11 +244,14 @@ public class GATKArgumentCollection {
//
// --------------------------------------------------------------------------------------------------------------
/**
* This flag tells GATK to refactor cigar string with NDN elements to one element. It intended primarily for use in
* a RNAseq pipeline since the problem might come up when using RNAseq aligner such as Tophat2 with provided transcriptoms.
* You should only use this if you know that your reads have that problem.
* Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS
* when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. These
* rarely happen, but when they do they cause GATK to fail with an error. Setting this flag tells the GATK to
* reduce "NDN" to a simpler CIGAR representation with one N element (with total length of the three refactored
* elements). From the point of view of variant calling, there is no meaningful difference between the two
* representations.
*/
@Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "refactor cigar string with NDN elements to one element", required = false)
@Argument(fullName = "refactor_NDN_cigar_string", shortName = "fixNDN", doc = "Reduce NDN elements in CIGAR string", required = false)
public boolean REFACTOR_NDN_CIGAR_READS = false;
// --------------------------------------------------------------------------------------------------------------

View File

@ -37,19 +37,24 @@ import org.broadinstitute.gatk.utils.exceptions.UserException;
import org.broadinstitute.gatk.utils.sam.GATKSAMRecord;
/**
* A read transformer that refactor NDN cigar elements to one N element.
* Reduce NDN cigar elements to one N element.
*
* <p>
* This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length of the three refactored elements).
* This is intended primarily for users of RNA-Seq data handling programs such as TopHat2.
* Currently we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
* those specific reads, users of TopHat and other tools can circumvent this problem without affecting the rest of their dataset.
* <p>This read transformer will refactor cigar strings that contain N-D-N elements to one N element (with total length
* of the three refactored elements). The engine parameter that activate this read transformer is
* `--refactor_NDN_cigar_string` / `-fixNDN`</p>
*
* NOTE: any walker that need that functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.
*
* The engine parameter that activate this read transformer is --refactor_NDN_cigar_string or -fixNDN
* </p>
* <h3>Rationale</h3>
* <p>Some RNAseq aligners that use a known transcriptome resource (such as TopHat2) produce NDN elements in read CIGARS
* when a small exon is entirely deleted during transcription, which ends up looking like [exon1]NDN[exon3]. Currently
* we consider that the internal N-D-N motif is illegal and we error out when we encounter it. By refactoring the cigar string of
* those specific reads, this read transformer allows users of TopHat and other tools to circumvent this problem without
* affecting the rest of their dataset. From the point of view of variant calling, there is no meaningful difference between
* the two representations.</p>
*
* <h3>Developer notes</h3>
* <ul>
* <li>Any walker that needs to apply this functionality should apply that read transformer in its map function, since it won't be activated by the GATK engine.</li>
* </ul>
*
*
* @author ami

View File

@ -52,41 +52,36 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import java.util.*;
/**
* Combines VCF records from different sources.
* Combine variant records from different sources
*
* <p>
* CombineVariants combines VCF records from different sources. Any (unique) name can be used to bind your rod data
* and any number of sources can be input. This tool currently supports two different combination types for each of
* variants (the first 8 fields of the VCF) and genotypes (the rest).
* Merge: combines multiple records into a single one; if sample names overlap then they are uniquified.
* Union: assumes each rod represents the same set of samples (although this is not enforced); using the
* priority list (if provided), it emits a single record instance at every position represented in the rods.
* <p>CombineVariants reads in variants records from separate ROD (Reference-Ordered Data) sources and combines them into
* a single VCF. Any (unique) name can be used to bind your ROD and any number of sources can be input. This tool aims
* to fulfill two main possible use cases, reflected by the two combination options (MERGE and UNION), for merging
* records at the variant level (the first 8 fields of the VCF) or at the genotype level (the rest).</p>
*
* CombineVariants will include a record at every site in all of your input VCF files, and annotate which input ROD
* bindings the record is present, pass, or filtered in in the set attribute in the INFO field. In effect,
* CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the N merged VCFs
* can be exacted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just
* <ul>
* <li><b>MERGE:</b> combines multiple variant records present at the same site in the different input sources into a
* single variant record in the output. If sample names overlap, then they are "uniquified" by default, which means a
* suffix is appended to make them unique. <em>Note that in version 3.3, the automatic uniquifying was disabled
* (unintentionally), and required setting `-genotypeMergeOptions UNIQUIFY` manually.</em></li>
*
* <li><b>UNION:</b> assumes that each ROD source represents the same set of samples (although this is not enforced).
* It uses the priority list (if provided) to emit a single record instance at every position represented in the input RODs.</li>
* </ul>
*
* <p>CombineVariants will emit a record for every site that was present in any of your input VCF files, and will annotate
* (in the set attribute in the INFO field) whether the record had a PASS or FILTER status in each input ROD . In effect,
* CombineVariants always produces a union of the input VCFs. However, any part of the Venn of the merged VCFs
* can be extracted using JEXL expressions on the set attribute using SelectVariants. If you want to extract just
* the records in common between two VCFs, you would first run CombineVariants on the two files to generate a single
* VCF and then run SelectVariants to extract the common records with -select 'set == "Intersection"', as worked out
* in the detailed example in the documentation guide.
*
* Note that CombineVariants supports multi-threaded parallelism (8/15/12). This is particularly useful
* when converting from VCF to BCF2, which can be expensive. In this case each thread spends CPU time
* doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
* efficiency. However, since this merge runs in only one thread, you can quickly reach diminishing
* returns with the number of parallel threads. -nt 4 works well but -nt 8 may be too much.
*
* Some fine details about the merging algorithm:
* <ul>
* <li> As of GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of
* the first VCF record with a non-MISSING QUAL value. The previous behavior was to take the
* max QUAL, which resulted in sometime strange downstream confusion</li>
* </ul>
* VCF and then run SelectVariants to extract the common records with `-select 'set == "Intersection"'`, as worked out
* in the detailed example in the documentation guide.</p>
*
* <h3>Input</h3>
* <p>
* One or more variant sets to combine.
* Two or more variant sets to combine.
* </p>
*
* <h3>Output</h3>
@ -95,6 +90,8 @@ import java.util.*;
* </p>
*
* <h3>Examples</h3>
* &nbsp;
* <h4>Merge two separate callsets</h4>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
@ -103,17 +100,37 @@ import java.util.*;
* --variant input2.vcf \
* -o output.vcf \
* -genotypeMergeOptions UNIQUIFY
*
* </pre>
* <h4>Get the union of calls made on the same samples </h4>
* <pre>
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T CombineVariants \
* --variant:foo input1.vcf \
* --variant:bar input2.vcf \
* -o output.vcf \
* -genotypeMergeOptions PRIORITIZE
* -genotypeMergeOptions PRIORITIZE \
* -priority foo,bar
* </pre>
*
* <h3>Caveats</h3>
* <ul>
* <li>This tool is not intended to manipulate GVCFS! To combine GVCF files output by HaplotypeCaller, use CombineGVCFs.</li>
* <li>To join intermediate VCFs produced by running jobs in parallel by interval (e.g. by chromosome), use CatVariants.</li>
* </ul>
*
* <h3>Additional notes</h3>
* <ul>
* <li> Using this tool's multi-threaded parallelism capability is particularly useful
* when converting from VCF to BCF2, which can be time-consuming. In this case each thread spends CPU time
* doing the conversion, and the GATK engine is smart enough to merge the partial BCF2 blocks together
* efficiently. However, since this merge runs in only one thread, you can quickly reach diminishing
* returns with the number of parallel threads. In our hands, `-nt 4` works well but `-nt 8` tends to be be too much.</li>
* <li>Since GATK 2.1, when merging multiple VCF records at a site, the combined VCF record has the QUAL of the first
* VCF record with a non-MISSING QUAL value. The previous behavior was to take the max QUAL, which could result
* in strange downstream confusion</li>
* </ul>
*
*/
@DocumentedGATKFeature( groupName = HelpConstants.DOCS_CAT_VARMANIP, extraDocs = {CommandLineGATK.class} )
@Reference(window=@Window(start=-50,stop=50))