diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index 68afed296..bb65d9b09 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -30,7 +30,9 @@ import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFileFactory; import net.sf.samtools.SAMRecord; import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; @@ -42,7 +44,6 @@ import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation; import org.broadinstitute.sting.utils.clipreads.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.ReadUtils; -import org.yaml.snakeyaml.events.SequenceStartEvent; import java.io.File; import java.io.PrintStream; @@ -51,102 +52,158 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; /** - * This tool provides simple, powerful read clipping capabilities. + * This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences. + * * *

- * It allows the user to clip bases in reads - * with poor quality scores, that match particular sequences, or that were generated by particular machine cycles. + * It allows the user to clip bases in reads with poor quality scores, that match particular + * sequences, or that were generated by particular machine cycles. + * + *

+ *
Quality score based clipping
+ *
+ * Clip bases from the read in clipper from + *
argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)
+ * to the end of the read. This is blatantly stolen from BWA. + * + * Walk through the read from the end (in machine cycle order) to the beginning, calculating the + * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this + * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the + * clipping index in the read (from the end). + *
+ *
Cycle based clipping
+ *
Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. + * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions). + * For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12. + *
+ *
Sequence matching
+ *
Clips bases from that exactly match one of a number of base sequences. This employs an exact match algorithm, + * filtering only bases whose sequence exactly matches SEQ.
+ *
+ * *

* *

Input

*

- * A BAM file containing. + * Any number of BAM files. *

* *

Output

*

- *

+ * A new BAM file containing all of the reads from the input BAMs with the user-specified clipping + * operation applied to each read. + *

+ *

+ *

Summary output

+ *
+ *     Number of examined reads              13
+ *     Number of clipped reads               13
+ *     Percent of clipped reads              100.00
+ *     Number of examined bases              988
+ *     Number of clipped bases               126
+ *     Percent of clipped bases              12.75
+ *     Number of quality-score clipped bases 126
+ *     Number of range clipped bases         0
+ *     Number of sequence clipped bases      0
+ *     
+ *

+ * + *

+ *

Example clipping

+ * Suppose we are given this read: + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
+ *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
+ * + * If we are clipping reads with -QT 10 and -CR WRITE_NS, we get: + * + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
+ *          NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
+ * + * Whereas with -CR WRITE_Q0S: + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3116    29      76M     *       *       *
+ *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
+ * + * Or -CR SOFTCLIP_BASES: + *
+ *     314KGAAXX090507:1:19:1420:1123#0        16      chrM    3133    29      17S59M  *       *       *
+ *          TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ *          #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *     
*

* *

Examples

*
- *     -T CallableLociWalker \
- *     -I my.bam \
- *     -summary my.summary \
- *     -o my.bed
+ *     -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
+ *     -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
  * 
- * - * would produce a BED file (my.bed) that looks like: - * - *
- *     20 10000000 10000864 CALLABLE
- *     20 10000865 10000985 POOR_MAPPING_QUALITY
- *     20 10000986 10001138 CALLABLE
- *     20 10001139 10001254 POOR_MAPPING_QUALITY
- *     20 10001255 10012255 CALLABLE
- *     20 10012256 10012259 POOR_MAPPING_QUALITY
- *     20 10012260 10012263 CALLABLE
- *     20 10012264 10012328 POOR_MAPPING_QUALITY
- *     20 10012329 10012550 CALLABLE
- *     20 10012551 10012551 LOW_COVERAGE
- *     20 10012552 10012554 CALLABLE
- *     20 10012555 10012557 LOW_COVERAGE
- *     20 10012558 10012558 CALLABLE
- *     et cetera...
- * 
- * as well as a summary table that looks like: - * - *
- *                        state nBases
- *                        REF_N 0
- *                     CALLABLE 996046
- *                  NO_COVERAGE 121
- *                 LOW_COVERAGE 928
- *           EXCESSIVE_COVERAGE 0
- *         POOR_MAPPING_QUALITY 2906
- * 
- * + * @author Mark DePristo - * @since May 7, 2010 + * @since 2010 */ @Requires({DataSource.READS}) public class ClipReadsWalker extends ReadWalker { - @Output - PrintStream out; + /** + * If provided, ClipReads will write summary statistics about the clipping operations applied + * to the reads to this file. + */ + @Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false) + PrintStream out = null; /** - * an optional argument to dump the reads out to a BAM file + * The output SAM/BAM file will be written here */ - @Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false) - StingSAMFileWriter outputBam = null; + @Output(doc = "Write BAM output here", required = true) + StingSAMFileWriter outputBam; - @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "", required = false) + /** + * If a value > 0 is provided, then the quality score based read clipper will be applied to the reads using this + * quality score threshold. + */ + @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "If provided, the Q-score clipper will be applied", required = false) int qTrimmingThreshold = -1; - @Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false) + /** + * Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc. + * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based + * values (positions). For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, + * and 12. + */ + @Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String indicating machine cycles to clip from the reads", required = false) String cyclesToClipArg = null; - @Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching these sequences", required = false) + /** + * Reads the sequences in the provided FASTA file, and clip any bases that exactly match any of the + * sequences in the file. + */ + @Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching the sequences in this FASTA file", required = false) String clipSequenceFile = null; + /** + * Clips bases from the reads matching the provided SEQ. Can be provided any number of times on the command line + */ @Argument(fullName = "clipSequence", shortName = "X", doc = "Remove sequences within reads matching this sequence", required = false) String[] clipSequencesArgs = null; - @Argument(fullName="read", doc="", required=false) - String onlyDoRead = null; - - //@Argument(fullName = "keepCompletelyClipped", shortName = "KCC", doc = "Unfortunately, sometimes a read is completely clipped away but with SOFTCLIP_BASES this results in an invalid CIGAR string. ", required = false) - //boolean keepCompletelyClippedReads = false; - -// @Argument(fullName = "onlyClipFirstSeqMatch", shortName = "ESC", doc="Only clip the first occurrence of a clipping sequence, rather than all subsequences within a read that match", required = false) -// boolean onlyClipFirstSeqMatch = false; - + /** + * The different values for this argument determines how ClipReads applies clips to the reads. This can range + * from writing Ns over the clipped bases to hard clipping away the bases from the BAM. + */ @Argument(fullName = "clipRepresentation", shortName = "CR", doc = "How should we actually clip the bases?", required = false) ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS; + @Hidden + @Advanced + @Argument(fullName="read", doc="", required=false) + String onlyDoRead = null; /** * List of sequence that should be clipped from the reads diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 98331ec1d..32875a098 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -22,6 +22,7 @@ package org.broadinstitute.sting.gatk.walkers.coverage; +import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; @@ -154,6 +155,7 @@ public class CallableLociWalker extends LocusWalker minMappingQuality and with base quality > minBaseQuality) exceeds this * value and is less than maxDepth the site is considered CALLABLE. */ + @Advanced @Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false) int minDepth = 4; @@ -168,6 +170,7 @@ public class CallableLociWalker extends LocusWalker { * By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered). * Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value. */ + @Advanced @Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false) public boolean showFiltered = false; /** * If provided, then this tool will exit with success after this number of records have been emitted to the file. */ + @Advanced @Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false) public int MAX_RECORDS = -1; int nRecords = 0; @@ -121,6 +123,7 @@ public class VariantsToTable extends RodWalker { * can make your resulting file unreadable and malformated according to tools like R, as the representation of * multi-allelic INFO field values can be lists of values. */ + @Advanced @Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false) public boolean keepMultiAllelic = false; @@ -131,6 +134,7 @@ public class VariantsToTable extends RodWalker { * fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument * will cause VariantsToTable to write out NA values for missing fields instead of throwing an error. */ + @Advanced @Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false) public boolean ALLOW_MISSING_DATA = false; diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java index 14c04b5c4..0dbe55726 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java @@ -4,9 +4,28 @@ package org.broadinstitute.sting.utils.clipreads; * How should we represent a clipped bases in a read? */ public enum ClippingRepresentation { - WRITE_NS, // change the bases to Ns - WRITE_Q0S, // change the quality scores to Q0 - WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns - SOFTCLIP_BASES, // change cigar string to S, but keep bases - HARDCLIP_BASES // remove the bases from the read + /** Clipped bases are changed to Ns */ + WRITE_NS, + + /** Clipped bases are changed to have Q0 quality score */ + WRITE_Q0S, + + /** Clipped bases are change to have both an N base and a Q0 quality score */ + WRITE_NS_Q0S, + + /** + * Change the read's cigar string to soft clip (S, see sam-spec) away the bases. + * Note that this can only be applied to cases where the clipped bases occur + * at the start or end of a read. + */ + SOFTCLIP_BASES, + + /** + * Change the read's cigar string to hard clip (H, see sam-spec) away the bases. + * Hard clipping, unlike soft clipping, actually removes bases from the read, + * reducing the resulting file's size but introducing an irrevesible (i.e., + * lossy) operation. Note that this can only be applied to cases where the clipped + * bases occur at the start or end of a read. + */ + HARDCLIP_BASES } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java index a129f8adf..ca3d1ee25 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java @@ -37,8 +37,8 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest { "-R " + hg18Reference + " -T ClipReads " + "-I " + validationDataLocation + "clippingReadsTest.bam " + - "-o %s " + - "-ob %s " + args, + "-os %s " + + "-o %s " + args, 2, // just one output file Arrays.asList("tmp", "bam"), Arrays.asList(md51, md52)); @@ -72,7 +72,7 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest { " -I " + validationDataLocation + "originalQuals.chr1.1-1K.bam" + " -L chr1:1-1,000" + " -OQ -QT 4 -CR WRITE_Q0S" + - " -o %s -ob %s", + " -o %s -os %s", 2, Arrays.asList("55c01ccc2e84481b22d3632cdb06c8ba", "22db22749f811d30216215e047461621")); executeTest("clipOriginalQuals", spec);