Documentation and code cleanup for ClipReads, CallableLoci, and VariantsToTable
-- Swapped -o [summary] and -ob [bam] for more standard -o [bam] and -os [summary] arguments. -- @Advanced arguments
This commit is contained in:
parent
49e831a13b
commit
b08d63a6b8
|
|
@ -30,7 +30,9 @@ import net.sf.picard.reference.ReferenceSequenceFile;
|
|||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.commandline.Advanced;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
|
|
@ -42,7 +44,6 @@ import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation;
|
|||
import org.broadinstitute.sting.utils.clipreads.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.yaml.snakeyaml.events.SequenceStartEvent;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
|
|
@ -51,102 +52,158 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* This tool provides simple, powerful read clipping capabilities.
|
||||
* This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences.
|
||||
*
|
||||
*
|
||||
* <p>
|
||||
* It allows the user to clip bases in reads
|
||||
* with poor quality scores, that match particular sequences, or that were generated by particular machine cycles.
|
||||
* It allows the user to clip bases in reads with poor quality scores, that match particular
|
||||
* sequences, or that were generated by particular machine cycles.
|
||||
*
|
||||
* <dl>
|
||||
* <dt>Quality score based clipping</dt>
|
||||
* <dd>
|
||||
* Clip bases from the read in clipper from
|
||||
* <br>argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)</br>
|
||||
* to the end of the read. This is blatantly stolen from BWA.
|
||||
*
|
||||
* Walk through the read from the end (in machine cycle order) to the beginning, calculating the
|
||||
* running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this
|
||||
* sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the
|
||||
* clipping index in the read (from the end).
|
||||
* </dd>
|
||||
* <dt>Cycle based clipping</dt>
|
||||
* <dd>Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
|
||||
* For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions).
|
||||
* For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12.
|
||||
* </dd>
|
||||
* <dt>Sequence matching</dt>
|
||||
* <dd>Clips bases from that exactly match one of a number of base sequences. This employs an exact match algorithm,
|
||||
* filtering only bases whose sequence exactly matches SEQ.</dd>
|
||||
* </dl>
|
||||
*
|
||||
* </p>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* A BAM file containing.
|
||||
* Any number of BAM files.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* <ul>
|
||||
* <li>-o: a OutputFormatted (recommended BED) file with the callable status covering each base</li>
|
||||
* <li>-summary: a table of callable status x count of all examined bases</li>
|
||||
* </ul>
|
||||
* A new BAM file containing all of the reads from the input BAMs with the user-specified clipping
|
||||
* operation applied to each read.
|
||||
* </p>
|
||||
* <p>
|
||||
* <h3>Summary output</h3>
|
||||
* <pre>
|
||||
* Number of examined reads 13
|
||||
* Number of clipped reads 13
|
||||
* Percent of clipped reads 100.00
|
||||
* Number of examined bases 988
|
||||
* Number of clipped bases 126
|
||||
* Percent of clipped bases 12.75
|
||||
* Number of quality-score clipped bases 126
|
||||
* Number of range clipped bases 0
|
||||
* Number of sequence clipped bases 0
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* <h3>Example clipping</h3>
|
||||
* Suppose we are given this read:
|
||||
* <pre>
|
||||
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
|
||||
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
|
||||
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
|
||||
* </pre>
|
||||
*
|
||||
* If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:
|
||||
*
|
||||
* <pre>
|
||||
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
|
||||
* NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
|
||||
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
|
||||
* </pre>
|
||||
*
|
||||
* Whereas with -CR WRITE_Q0S:
|
||||
* <pre>
|
||||
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
|
||||
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
|
||||
* !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
|
||||
* </pre>
|
||||
*
|
||||
* Or -CR SOFTCLIP_BASES:
|
||||
* <pre>
|
||||
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3133 29 17S59M * * *
|
||||
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
|
||||
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* -T CallableLociWalker \
|
||||
* -I my.bam \
|
||||
* -summary my.summary \
|
||||
* -o my.bed
|
||||
* -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
|
||||
* -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
|
||||
* </pre>
|
||||
*
|
||||
* would produce a BED file (my.bed) that looks like:
|
||||
*
|
||||
* <pre>
|
||||
* 20 10000000 10000864 CALLABLE
|
||||
* 20 10000865 10000985 POOR_MAPPING_QUALITY
|
||||
* 20 10000986 10001138 CALLABLE
|
||||
* 20 10001139 10001254 POOR_MAPPING_QUALITY
|
||||
* 20 10001255 10012255 CALLABLE
|
||||
* 20 10012256 10012259 POOR_MAPPING_QUALITY
|
||||
* 20 10012260 10012263 CALLABLE
|
||||
* 20 10012264 10012328 POOR_MAPPING_QUALITY
|
||||
* 20 10012329 10012550 CALLABLE
|
||||
* 20 10012551 10012551 LOW_COVERAGE
|
||||
* 20 10012552 10012554 CALLABLE
|
||||
* 20 10012555 10012557 LOW_COVERAGE
|
||||
* 20 10012558 10012558 CALLABLE
|
||||
* et cetera...
|
||||
* </pre>
|
||||
* as well as a summary table that looks like:
|
||||
*
|
||||
* <pre>
|
||||
* state nBases
|
||||
* REF_N 0
|
||||
* CALLABLE 996046
|
||||
* NO_COVERAGE 121
|
||||
* LOW_COVERAGE 928
|
||||
* EXCESSIVE_COVERAGE 0
|
||||
* POOR_MAPPING_QUALITY 2906
|
||||
* </pre>
|
||||
*
|
||||
|
||||
* @author Mark DePristo
|
||||
* @since May 7, 2010
|
||||
* @since 2010
|
||||
*/
|
||||
@Requires({DataSource.READS})
|
||||
public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipperWithData, ClipReadsWalker.ClippingData> {
|
||||
@Output
|
||||
PrintStream out;
|
||||
/**
|
||||
* If provided, ClipReads will write summary statistics about the clipping operations applied
|
||||
* to the reads to this file.
|
||||
*/
|
||||
@Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false)
|
||||
PrintStream out = null;
|
||||
|
||||
/**
|
||||
* an optional argument to dump the reads out to a BAM file
|
||||
* The output SAM/BAM file will be written here
|
||||
*/
|
||||
@Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false)
|
||||
StingSAMFileWriter outputBam = null;
|
||||
@Output(doc = "Write BAM output here", required = true)
|
||||
StingSAMFileWriter outputBam;
|
||||
|
||||
@Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "", required = false)
|
||||
/**
|
||||
* If a value > 0 is provided, then the quality score based read clipper will be applied to the reads using this
|
||||
* quality score threshold.
|
||||
*/
|
||||
@Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "If provided, the Q-score clipper will be applied", required = false)
|
||||
int qTrimmingThreshold = -1;
|
||||
|
||||
@Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false)
|
||||
/**
|
||||
* Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
|
||||
* For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based
|
||||
* values (positions). For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11,
|
||||
* and 12.
|
||||
*/
|
||||
@Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String indicating machine cycles to clip from the reads", required = false)
|
||||
String cyclesToClipArg = null;
|
||||
|
||||
@Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching these sequences", required = false)
|
||||
/**
|
||||
* Reads the sequences in the provided FASTA file, and clip any bases that exactly match any of the
|
||||
* sequences in the file.
|
||||
*/
|
||||
@Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching the sequences in this FASTA file", required = false)
|
||||
String clipSequenceFile = null;
|
||||
|
||||
/**
|
||||
* Clips bases from the reads matching the provided SEQ. Can be provided any number of times on the command line
|
||||
*/
|
||||
@Argument(fullName = "clipSequence", shortName = "X", doc = "Remove sequences within reads matching this sequence", required = false)
|
||||
String[] clipSequencesArgs = null;
|
||||
|
||||
@Argument(fullName="read", doc="", required=false)
|
||||
String onlyDoRead = null;
|
||||
|
||||
//@Argument(fullName = "keepCompletelyClipped", shortName = "KCC", doc = "Unfortunately, sometimes a read is completely clipped away but with SOFTCLIP_BASES this results in an invalid CIGAR string. ", required = false)
|
||||
//boolean keepCompletelyClippedReads = false;
|
||||
|
||||
// @Argument(fullName = "onlyClipFirstSeqMatch", shortName = "ESC", doc="Only clip the first occurrence of a clipping sequence, rather than all subsequences within a read that match", required = false)
|
||||
// boolean onlyClipFirstSeqMatch = false;
|
||||
|
||||
/**
|
||||
* The different values for this argument determines how ClipReads applies clips to the reads. This can range
|
||||
* from writing Ns over the clipped bases to hard clipping away the bases from the BAM.
|
||||
*/
|
||||
@Argument(fullName = "clipRepresentation", shortName = "CR", doc = "How should we actually clip the bases?", required = false)
|
||||
ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS;
|
||||
|
||||
@Hidden
|
||||
@Advanced
|
||||
@Argument(fullName="read", doc="", required=false)
|
||||
String onlyDoRead = null;
|
||||
|
||||
/**
|
||||
* List of sequence that should be clipped from the reads
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.coverage;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Advanced;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
|
|
@ -154,6 +155,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
|
|||
* If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
|
||||
* value and is less than maxDepth the site is considered CALLABLE.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
|
||||
int minDepth = 4;
|
||||
|
||||
|
|
@ -168,6 +170,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
|
|||
* won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads
|
||||
* covering the site.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false)
|
||||
int minDepthLowMAPQ = 10;
|
||||
|
||||
|
|
@ -181,6 +184,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
|
|||
/**
|
||||
* The output of this walker will be written in this format. The recommended option is BED.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "format", shortName = "format", doc = "Output format", required = false)
|
||||
OutputFormat outputFormat;
|
||||
|
||||
|
|
|
|||
|
|
@ -105,12 +105,14 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
|
||||
* Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false)
|
||||
public boolean showFiltered = false;
|
||||
|
||||
/**
|
||||
* If provided, then this tool will exit with success after this number of records have been emitted to the file.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false)
|
||||
public int MAX_RECORDS = -1;
|
||||
int nRecords = 0;
|
||||
|
|
@ -121,6 +123,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
* can make your resulting file unreadable and malformated according to tools like R, as the representation of
|
||||
* multi-allelic INFO field values can be lists of values.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
|
||||
public boolean keepMultiAllelic = false;
|
||||
|
||||
|
|
@ -131,6 +134,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
|
|||
* fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument
|
||||
* will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false)
|
||||
public boolean ALLOW_MISSING_DATA = false;
|
||||
|
||||
|
|
|
|||
|
|
@ -4,9 +4,28 @@ package org.broadinstitute.sting.utils.clipreads;
|
|||
* How should we represent a clipped bases in a read?
|
||||
*/
|
||||
public enum ClippingRepresentation {
|
||||
WRITE_NS, // change the bases to Ns
|
||||
WRITE_Q0S, // change the quality scores to Q0
|
||||
WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns
|
||||
SOFTCLIP_BASES, // change cigar string to S, but keep bases
|
||||
HARDCLIP_BASES // remove the bases from the read
|
||||
/** Clipped bases are changed to Ns */
|
||||
WRITE_NS,
|
||||
|
||||
/** Clipped bases are changed to have Q0 quality score */
|
||||
WRITE_Q0S,
|
||||
|
||||
/** Clipped bases are change to have both an N base and a Q0 quality score */
|
||||
WRITE_NS_Q0S,
|
||||
|
||||
/**
|
||||
* Change the read's cigar string to soft clip (S, see sam-spec) away the bases.
|
||||
* Note that this can only be applied to cases where the clipped bases occur
|
||||
* at the start or end of a read.
|
||||
*/
|
||||
SOFTCLIP_BASES,
|
||||
|
||||
/**
|
||||
* Change the read's cigar string to hard clip (H, see sam-spec) away the bases.
|
||||
* Hard clipping, unlike soft clipping, actually removes bases from the read,
|
||||
* reducing the resulting file's size but introducing an irrevesible (i.e.,
|
||||
* lossy) operation. Note that this can only be applied to cases where the clipped
|
||||
* bases occur at the start or end of a read.
|
||||
*/
|
||||
HARDCLIP_BASES
|
||||
}
|
||||
|
|
|
|||
|
|
@ -37,8 +37,8 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest {
|
|||
"-R " + hg18Reference +
|
||||
" -T ClipReads " +
|
||||
"-I " + validationDataLocation + "clippingReadsTest.bam " +
|
||||
"-o %s " +
|
||||
"-ob %s " + args,
|
||||
"-os %s " +
|
||||
"-o %s " + args,
|
||||
2, // just one output file
|
||||
Arrays.asList("tmp", "bam"),
|
||||
Arrays.asList(md51, md52));
|
||||
|
|
@ -72,7 +72,7 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest {
|
|||
" -I " + validationDataLocation + "originalQuals.chr1.1-1K.bam" +
|
||||
" -L chr1:1-1,000" +
|
||||
" -OQ -QT 4 -CR WRITE_Q0S" +
|
||||
" -o %s -ob %s",
|
||||
" -o %s -os %s",
|
||||
2,
|
||||
Arrays.asList("55c01ccc2e84481b22d3632cdb06c8ba", "22db22749f811d30216215e047461621"));
|
||||
executeTest("clipOriginalQuals", spec);
|
||||
|
|
|
|||
Loading…
Reference in New Issue