Documentation and code cleanup for ClipReads, CallableLoci, and VariantsToTable

-- Swapped -o [summary] and -ob [bam] for more standard -o [bam] and -os [summary] arguments.
-- @Advanced arguments
This commit is contained in:
Mark DePristo 2011-08-19 15:06:37 -04:00
parent 49e831a13b
commit b08d63a6b8
5 changed files with 154 additions and 70 deletions

View File

@ -30,7 +30,9 @@ import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
@ -42,7 +44,6 @@ import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation;
import org.broadinstitute.sting.utils.clipreads.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.yaml.snakeyaml.events.SequenceStartEvent;
import java.io.File;
import java.io.PrintStream;
@ -51,102 +52,158 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* This tool provides simple, powerful read clipping capabilities.
* This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences.
*
*
* <p>
* It allows the user to clip bases in reads
* with poor quality scores, that match particular sequences, or that were generated by particular machine cycles.
* It allows the user to clip bases in reads with poor quality scores, that match particular
* sequences, or that were generated by particular machine cycles.
*
* <dl>
* <dt>Quality score based clipping</dt>
* <dd>
* Clip bases from the read in clipper from
* <br>argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)</br>
* to the end of the read. This is blatantly stolen from BWA.
*
* Walk through the read from the end (in machine cycle order) to the beginning, calculating the
* running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this
* sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the
* clipping index in the read (from the end).
* </dd>
* <dt>Cycle based clipping</dt>
* <dd>Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
* For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions).
* For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12.
* </dd>
* <dt>Sequence matching</dt>
* <dd>Clips bases from that exactly match one of a number of base sequences. This employs an exact match algorithm,
* filtering only bases whose sequence exactly matches SEQ.</dd>
* </dl>
*
* </p>
*
* <h2>Input</h2>
* <p>
* A BAM file containing.
* Any number of BAM files.
* </p>
*
* <h2>Output</h2>
* <p>
* <ul>
* <li>-o: a OutputFormatted (recommended BED) file with the callable status covering each base</li>
* <li>-summary: a table of callable status x count of all examined bases</li>
* </ul>
* A new BAM file containing all of the reads from the input BAMs with the user-specified clipping
* operation applied to each read.
* </p>
* <p>
* <h3>Summary output</h3>
* <pre>
* Number of examined reads 13
* Number of clipped reads 13
* Percent of clipped reads 100.00
* Number of examined bases 988
* Number of clipped bases 126
* Percent of clipped bases 12.75
* Number of quality-score clipped bases 126
* Number of range clipped bases 0
* Number of sequence clipped bases 0
* </pre>
* </p>
*
* <p>
* <h3>Example clipping</h3>
* Suppose we are given this read:
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
*
* If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:
*
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
* NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
*
* Whereas with -CR WRITE_Q0S:
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
*
* Or -CR SOFTCLIP_BASES:
* <pre>
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3133 29 17S59M * * *
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
* </pre>
* </p>
*
* <h2>Examples</h2>
* <pre>
* -T CallableLociWalker \
* -I my.bam \
* -summary my.summary \
* -o my.bed
* -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
* -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
* </pre>
*
* would produce a BED file (my.bed) that looks like:
*
* <pre>
* 20 10000000 10000864 CALLABLE
* 20 10000865 10000985 POOR_MAPPING_QUALITY
* 20 10000986 10001138 CALLABLE
* 20 10001139 10001254 POOR_MAPPING_QUALITY
* 20 10001255 10012255 CALLABLE
* 20 10012256 10012259 POOR_MAPPING_QUALITY
* 20 10012260 10012263 CALLABLE
* 20 10012264 10012328 POOR_MAPPING_QUALITY
* 20 10012329 10012550 CALLABLE
* 20 10012551 10012551 LOW_COVERAGE
* 20 10012552 10012554 CALLABLE
* 20 10012555 10012557 LOW_COVERAGE
* 20 10012558 10012558 CALLABLE
* et cetera...
* </pre>
* as well as a summary table that looks like:
*
* <pre>
* state nBases
* REF_N 0
* CALLABLE 996046
* NO_COVERAGE 121
* LOW_COVERAGE 928
* EXCESSIVE_COVERAGE 0
* POOR_MAPPING_QUALITY 2906
* </pre>
*
* @author Mark DePristo
* @since May 7, 2010
* @since 2010
*/
@Requires({DataSource.READS})
public class ClipReadsWalker extends ReadWalker<ClipReadsWalker.ReadClipperWithData, ClipReadsWalker.ClippingData> {
@Output
PrintStream out;
/**
* If provided, ClipReads will write summary statistics about the clipping operations applied
* to the reads to this file.
*/
@Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false)
PrintStream out = null;
/**
* an optional argument to dump the reads out to a BAM file
* The output SAM/BAM file will be written here
*/
@Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false)
StingSAMFileWriter outputBam = null;
@Output(doc = "Write BAM output here", required = true)
StingSAMFileWriter outputBam;
@Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "", required = false)
/**
* If a value > 0 is provided, then the quality score based read clipper will be applied to the reads using this
* quality score threshold.
*/
@Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "If provided, the Q-score clipper will be applied", required = false)
int qTrimmingThreshold = -1;
@Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false)
/**
* Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
* For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based
* values (positions). For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11,
* and 12.
*/
@Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String indicating machine cycles to clip from the reads", required = false)
String cyclesToClipArg = null;
@Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching these sequences", required = false)
/**
* Reads the sequences in the provided FASTA file, and clip any bases that exactly match any of the
* sequences in the file.
*/
@Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching the sequences in this FASTA file", required = false)
String clipSequenceFile = null;
/**
* Clips bases from the reads matching the provided SEQ. Can be provided any number of times on the command line
*/
@Argument(fullName = "clipSequence", shortName = "X", doc = "Remove sequences within reads matching this sequence", required = false)
String[] clipSequencesArgs = null;
@Argument(fullName="read", doc="", required=false)
String onlyDoRead = null;
//@Argument(fullName = "keepCompletelyClipped", shortName = "KCC", doc = "Unfortunately, sometimes a read is completely clipped away but with SOFTCLIP_BASES this results in an invalid CIGAR string. ", required = false)
//boolean keepCompletelyClippedReads = false;
// @Argument(fullName = "onlyClipFirstSeqMatch", shortName = "ESC", doc="Only clip the first occurrence of a clipping sequence, rather than all subsequences within a read that match", required = false)
// boolean onlyClipFirstSeqMatch = false;
/**
* The different values for this argument determines how ClipReads applies clips to the reads. This can range
* from writing Ns over the clipped bases to hard clipping away the bases from the BAM.
*/
@Argument(fullName = "clipRepresentation", shortName = "CR", doc = "How should we actually clip the bases?", required = false)
ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS;
@Hidden
@Advanced
@Argument(fullName="read", doc="", required=false)
String onlyDoRead = null;
/**
* List of sequence that should be clipped from the reads

View File

@ -22,6 +22,7 @@
package org.broadinstitute.sting.gatk.walkers.coverage;
import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@ -154,6 +155,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
* If the number of QC+ bases (on reads with MAPQ > minMappingQuality and with base quality > minBaseQuality) exceeds this
* value and is less than maxDepth the site is considered CALLABLE.
*/
@Advanced
@Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
int minDepth = 4;
@ -168,6 +170,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
* won't assign a site to the POOR_MAPPING_QUALITY state unless there are at least minDepthForLowMAPQ reads
* covering the site.
*/
@Advanced
@Argument(fullName = "minDepthForLowMAPQ", shortName = "mdflmq", doc = "Minimum read depth before a locus is considered a potential candidate for poorly mapped", required = false)
int minDepthLowMAPQ = 10;
@ -181,6 +184,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
/**
* The output of this walker will be written in this format. The recommended option is BED.
*/
@Advanced
@Argument(fullName = "format", shortName = "format", doc = "Output format", required = false)
OutputFormat outputFormat;

View File

@ -105,12 +105,14 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
* Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value.
*/
@Advanced
@Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false)
public boolean showFiltered = false;
/**
* If provided, then this tool will exit with success after this number of records have been emitted to the file.
*/
@Advanced
@Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false)
public int MAX_RECORDS = -1;
int nRecords = 0;
@ -121,6 +123,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
* can make your resulting file unreadable and malformated according to tools like R, as the representation of
* multi-allelic INFO field values can be lists of values.
*/
@Advanced
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
public boolean keepMultiAllelic = false;
@ -131,6 +134,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
* fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument
* will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
*/
@Advanced
@Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false)
public boolean ALLOW_MISSING_DATA = false;

View File

@ -4,9 +4,28 @@ package org.broadinstitute.sting.utils.clipreads;
* How should we represent a clipped bases in a read?
*/
public enum ClippingRepresentation {
WRITE_NS, // change the bases to Ns
WRITE_Q0S, // change the quality scores to Q0
WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns
SOFTCLIP_BASES, // change cigar string to S, but keep bases
HARDCLIP_BASES // remove the bases from the read
/** Clipped bases are changed to Ns */
WRITE_NS,
/** Clipped bases are changed to have Q0 quality score */
WRITE_Q0S,
/** Clipped bases are change to have both an N base and a Q0 quality score */
WRITE_NS_Q0S,
/**
* Change the read's cigar string to soft clip (S, see sam-spec) away the bases.
* Note that this can only be applied to cases where the clipped bases occur
* at the start or end of a read.
*/
SOFTCLIP_BASES,
/**
* Change the read's cigar string to hard clip (H, see sam-spec) away the bases.
* Hard clipping, unlike soft clipping, actually removes bases from the read,
* reducing the resulting file's size but introducing an irrevesible (i.e.,
* lossy) operation. Note that this can only be applied to cases where the clipped
* bases occur at the start or end of a read.
*/
HARDCLIP_BASES
}

View File

@ -37,8 +37,8 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest {
"-R " + hg18Reference +
" -T ClipReads " +
"-I " + validationDataLocation + "clippingReadsTest.bam " +
"-o %s " +
"-ob %s " + args,
"-os %s " +
"-o %s " + args,
2, // just one output file
Arrays.asList("tmp", "bam"),
Arrays.asList(md51, md52));
@ -72,7 +72,7 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest {
" -I " + validationDataLocation + "originalQuals.chr1.1-1K.bam" +
" -L chr1:1-1,000" +
" -OQ -QT 4 -CR WRITE_Q0S" +
" -o %s -ob %s",
" -o %s -os %s",
2,
Arrays.asList("55c01ccc2e84481b22d3632cdb06c8ba", "22db22749f811d30216215e047461621"));
executeTest("clipOriginalQuals", spec);