From b08d63a6b8dc4e7b8f21c528f234d34f81ce654a Mon Sep 17 00:00:00 2001
From: Mark DePristo
Date: Fri, 19 Aug 2011 15:06:37 -0400
Subject: [PATCH] Documentation and code cleanup for ClipReads, CallableLoci,
and VariantsToTable
-- Swapped -o [summary] and -ob [bam] for more standard -o [bam] and -os [summary] arguments.
-- @Advanced arguments
---
.../sting/gatk/walkers/ClipReadsWalker.java | 181 ++++++++++++------
.../walkers/coverage/CallableLociWalker.java | 4 +
.../walkers/variantutils/VariantsToTable.java | 4 +
.../clipreads/ClippingRepresentation.java | 29 ++-
.../ClipReadsWalkersIntegrationTest.java | 6 +-
5 files changed, 154 insertions(+), 70 deletions(-)
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java
index 68afed296..bb65d9b09 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java
@@ -30,7 +30,9 @@ import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.util.StringUtil;
+import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
+import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
@@ -42,7 +44,6 @@ import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation;
import org.broadinstitute.sting.utils.clipreads.ReadClipper;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.sam.ReadUtils;
-import org.yaml.snakeyaml.events.SequenceStartEvent;
import java.io.File;
import java.io.PrintStream;
@@ -51,102 +52,158 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
- * This tool provides simple, powerful read clipping capabilities.
+ * This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences.
+ *
*
*
- * It allows the user to clip bases in reads
- * with poor quality scores, that match particular sequences, or that were generated by particular machine cycles.
+ * It allows the user to clip bases in reads with poor quality scores, that match particular
+ * sequences, or that were generated by particular machine cycles.
+ *
+ *
+ * - Quality score based clipping
+ * -
+ * Clip bases from the read in clipper from
+ *
argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)
+ * to the end of the read. This is blatantly stolen from BWA.
+ *
+ * Walk through the read from the end (in machine cycle order) to the beginning, calculating the
+ * running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this
+ * sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the
+ * clipping index in the read (from the end).
+ *
+ * - Cycle based clipping
+ * - Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
+ * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions).
+ * For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12.
+ *
+ * - Sequence matching
+ * - Clips bases from that exactly match one of a number of base sequences. This employs an exact match algorithm,
+ * filtering only bases whose sequence exactly matches SEQ.
+ *
+ *
*
*
* Input
*
- * A BAM file containing.
+ * Any number of BAM files.
*
*
* Output
*
- *
- * - -o: a OutputFormatted (recommended BED) file with the callable status covering each base
- * - -summary: a table of callable status x count of all examined bases
- *
+ * A new BAM file containing all of the reads from the input BAMs with the user-specified clipping
+ * operation applied to each read.
+ *
+ *
+ *
Summary output
+ *
+ * Number of examined reads 13
+ * Number of clipped reads 13
+ * Percent of clipped reads 100.00
+ * Number of examined bases 988
+ * Number of clipped bases 126
+ * Percent of clipped bases 12.75
+ * Number of quality-score clipped bases 126
+ * Number of range clipped bases 0
+ * Number of sequence clipped bases 0
+ *
+ *
+ *
+ *
+ *
Example clipping
+ * Suppose we are given this read:
+ *
+ * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
+ * TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ * #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *
+ *
+ * If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:
+ *
+ *
+ * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
+ * NNNNNNNNNNNNNNNNNTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ * #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *
+ *
+ * Whereas with -CR WRITE_Q0S:
+ *
+ * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
+ * TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ * !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *
+ *
+ * Or -CR SOFTCLIP_BASES:
+ *
+ * 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3133 29 17S59M * * *
+ * TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
+ * #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
+ *
*
*
* Examples
*
- * -T CallableLociWalker \
- * -I my.bam \
- * -summary my.summary \
- * -o my.bed
+ * -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
+ * -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
*
- *
- * would produce a BED file (my.bed) that looks like:
- *
- *
- * 20 10000000 10000864 CALLABLE
- * 20 10000865 10000985 POOR_MAPPING_QUALITY
- * 20 10000986 10001138 CALLABLE
- * 20 10001139 10001254 POOR_MAPPING_QUALITY
- * 20 10001255 10012255 CALLABLE
- * 20 10012256 10012259 POOR_MAPPING_QUALITY
- * 20 10012260 10012263 CALLABLE
- * 20 10012264 10012328 POOR_MAPPING_QUALITY
- * 20 10012329 10012550 CALLABLE
- * 20 10012551 10012551 LOW_COVERAGE
- * 20 10012552 10012554 CALLABLE
- * 20 10012555 10012557 LOW_COVERAGE
- * 20 10012558 10012558 CALLABLE
- * et cetera...
- *
- * as well as a summary table that looks like:
- *
- *
- * state nBases
- * REF_N 0
- * CALLABLE 996046
- * NO_COVERAGE 121
- * LOW_COVERAGE 928
- * EXCESSIVE_COVERAGE 0
- * POOR_MAPPING_QUALITY 2906
- *
- *
+
* @author Mark DePristo
- * @since May 7, 2010
+ * @since 2010
*/
@Requires({DataSource.READS})
public class ClipReadsWalker extends ReadWalker {
- @Output
- PrintStream out;
+ /**
+ * If provided, ClipReads will write summary statistics about the clipping operations applied
+ * to the reads to this file.
+ */
+ @Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false)
+ PrintStream out = null;
/**
- * an optional argument to dump the reads out to a BAM file
+ * The output SAM/BAM file will be written here
*/
- @Argument(fullName = "outputBam", shortName = "ob", doc = "Write output to this BAM filename instead of STDOUT", required = false)
- StingSAMFileWriter outputBam = null;
+ @Output(doc = "Write BAM output here", required = true)
+ StingSAMFileWriter outputBam;
- @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "", required = false)
+ /**
+ * If a value > 0 is provided, then the quality score based read clipper will be applied to the reads using this
+ * quality score threshold.
+ */
+ @Argument(fullName = "qTrimmingThreshold", shortName = "QT", doc = "If provided, the Q-score clipper will be applied", required = false)
int qTrimmingThreshold = -1;
- @Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String of the form 1-10,20-30 indicating machine cycles to clip from the reads", required = false)
+ /**
+ * Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
+ * For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based
+ * values (positions). For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11,
+ * and 12.
+ */
+ @Argument(fullName = "cyclesToTrim", shortName = "CT", doc = "String indicating machine cycles to clip from the reads", required = false)
String cyclesToClipArg = null;
- @Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching these sequences", required = false)
+ /**
+ * Reads the sequences in the provided FASTA file, and clip any bases that exactly match any of the
+ * sequences in the file.
+ */
+ @Argument(fullName = "clipSequencesFile", shortName = "XF", doc = "Remove sequences within reads matching the sequences in this FASTA file", required = false)
String clipSequenceFile = null;
+ /**
+ * Clips bases from the reads matching the provided SEQ. Can be provided any number of times on the command line
+ */
@Argument(fullName = "clipSequence", shortName = "X", doc = "Remove sequences within reads matching this sequence", required = false)
String[] clipSequencesArgs = null;
- @Argument(fullName="read", doc="", required=false)
- String onlyDoRead = null;
-
- //@Argument(fullName = "keepCompletelyClipped", shortName = "KCC", doc = "Unfortunately, sometimes a read is completely clipped away but with SOFTCLIP_BASES this results in an invalid CIGAR string. ", required = false)
- //boolean keepCompletelyClippedReads = false;
-
-// @Argument(fullName = "onlyClipFirstSeqMatch", shortName = "ESC", doc="Only clip the first occurrence of a clipping sequence, rather than all subsequences within a read that match", required = false)
-// boolean onlyClipFirstSeqMatch = false;
-
+ /**
+ * The different values for this argument determines how ClipReads applies clips to the reads. This can range
+ * from writing Ns over the clipped bases to hard clipping away the bases from the BAM.
+ */
@Argument(fullName = "clipRepresentation", shortName = "CR", doc = "How should we actually clip the bases?", required = false)
ClippingRepresentation clippingRepresentation = ClippingRepresentation.WRITE_NS;
+ @Hidden
+ @Advanced
+ @Argument(fullName="read", doc="", required=false)
+ String onlyDoRead = null;
/**
* List of sequence that should be clipped from the reads
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
index 98331ec1d..32875a098 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java
@@ -22,6 +22,7 @@
package org.broadinstitute.sting.gatk.walkers.coverage;
+import org.broadinstitute.sting.commandline.Advanced;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
@@ -154,6 +155,7 @@ public class CallableLociWalker extends LocusWalker minMappingQuality and with base quality > minBaseQuality) exceeds this
* value and is less than maxDepth the site is considered CALLABLE.
*/
+ @Advanced
@Argument(fullName = "minDepth", shortName = "minDepth", doc = "Minimum QC+ read depth before a locus is considered callable", required = false)
int minDepth = 4;
@@ -168,6 +170,7 @@ public class CallableLociWalker extends LocusWalker {
* By default this tool only emits values for fields where the FILTER field is either PASS or . (unfiltered).
* Throwing this flag will cause $WalkerName to emit values regardless of the FILTER field value.
*/
+ @Advanced
@Argument(fullName="showFiltered", shortName="raw", doc="If provided, field values from filtered records will be included in the output", required=false)
public boolean showFiltered = false;
/**
* If provided, then this tool will exit with success after this number of records have been emitted to the file.
*/
+ @Advanced
@Argument(fullName="maxRecords", shortName="M", doc="If provided, we will emit at most maxRecord records to the table", required=false)
public int MAX_RECORDS = -1;
int nRecords = 0;
@@ -121,6 +123,7 @@ public class VariantsToTable extends RodWalker {
* can make your resulting file unreadable and malformated according to tools like R, as the representation of
* multi-allelic INFO field values can be lists of values.
*/
+ @Advanced
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
public boolean keepMultiAllelic = false;
@@ -131,6 +134,7 @@ public class VariantsToTable extends RodWalker {
* fields (e.g., AC not being calculated for filtered records, if included). When provided, this argument
* will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
*/
+ @Advanced
@Argument(fullName="allowMissingData", shortName="AMD", doc="If provided, we will not require every record to contain every field", required=false)
public boolean ALLOW_MISSING_DATA = false;
diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java
index 14c04b5c4..0dbe55726 100644
--- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java
+++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java
@@ -4,9 +4,28 @@ package org.broadinstitute.sting.utils.clipreads;
* How should we represent a clipped bases in a read?
*/
public enum ClippingRepresentation {
- WRITE_NS, // change the bases to Ns
- WRITE_Q0S, // change the quality scores to Q0
- WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns
- SOFTCLIP_BASES, // change cigar string to S, but keep bases
- HARDCLIP_BASES // remove the bases from the read
+ /** Clipped bases are changed to Ns */
+ WRITE_NS,
+
+ /** Clipped bases are changed to have Q0 quality score */
+ WRITE_Q0S,
+
+ /** Clipped bases are change to have both an N base and a Q0 quality score */
+ WRITE_NS_Q0S,
+
+ /**
+ * Change the read's cigar string to soft clip (S, see sam-spec) away the bases.
+ * Note that this can only be applied to cases where the clipped bases occur
+ * at the start or end of a read.
+ */
+ SOFTCLIP_BASES,
+
+ /**
+ * Change the read's cigar string to hard clip (H, see sam-spec) away the bases.
+ * Hard clipping, unlike soft clipping, actually removes bases from the read,
+ * reducing the resulting file's size but introducing an irrevesible (i.e.,
+ * lossy) operation. Note that this can only be applied to cases where the clipped
+ * bases occur at the start or end of a read.
+ */
+ HARDCLIP_BASES
}
diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java
index a129f8adf..ca3d1ee25 100755
--- a/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java
+++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/ClipReadsWalkersIntegrationTest.java
@@ -37,8 +37,8 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest {
"-R " + hg18Reference +
" -T ClipReads " +
"-I " + validationDataLocation + "clippingReadsTest.bam " +
- "-o %s " +
- "-ob %s " + args,
+ "-os %s " +
+ "-o %s " + args,
2, // just one output file
Arrays.asList("tmp", "bam"),
Arrays.asList(md51, md52));
@@ -72,7 +72,7 @@ public class ClipReadsWalkersIntegrationTest extends WalkerTest {
" -I " + validationDataLocation + "originalQuals.chr1.1-1K.bam" +
" -L chr1:1-1,000" +
" -OQ -QT 4 -CR WRITE_Q0S" +
- " -o %s -ob %s",
+ " -o %s -os %s",
2,
Arrays.asList("55c01ccc2e84481b22d3632cdb06c8ba", "22db22749f811d30216215e047461621"));
executeTest("clipOriginalQuals", spec);