Merge pull request #390 from broadinstitute/mc_update_clipreads
Added REVERT SOFTCLIPPED bases to ClipReads
This commit is contained in:
commit
292426b504
|
|
@ -57,36 +57,34 @@ import java.util.regex.Matcher;
|
|||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* This tool provides simple, powerful read clipping capabilities to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences.
|
||||
* Read clipping based on quality, position or sequence matching
|
||||
*
|
||||
* <p>This tool provides simple, powerful read clipping capabilities that allow you to remove low quality strings of bases, sections of reads, and reads containing user-provided sequences.</p>
|
||||
*
|
||||
* <p>
|
||||
* It allows the user to clip bases in reads with poor quality scores, that match particular
|
||||
* sequences, or that were generated by particular machine cycles.
|
||||
* <p>There are three options for clipping (quality, position and sequence), which can be used alone or in combination. In addition, you can also specify a clipping representation, which determines exactly how ClipReads applies clips to the reads (soft clips, writing Q0 base quality scores, etc.). Please note that you MUST specify at least one of the three clipping options, and specifying a clipping representation is not sufficient. If you do not specify a clipping option, the program will run but it will not do anything to your reads.</p>
|
||||
*
|
||||
* <dl>
|
||||
* <dt>Quality score based clipping</dt>
|
||||
* <dd>
|
||||
* Clip bases from the read, starting at position
|
||||
* <br>argmax_x{ \sum{i = x + 1}^l (qTrimmingThreshold - qual)</br>
|
||||
* to the end of the read. This is blatantly stolen from BWA.
|
||||
* <pre>argmax_x { \sum_{i = x + 1}^{l} (qTrimmingThreshold - qual_i) }</pre>
|
||||
* to the end of the read. This is copied from BWA.
|
||||
*
|
||||
* Walk through the read from the end (in machine cycle order) to the beginning, calculating the
|
||||
* running sum of qTrimmingThreshold - qual. While we do this, we track the maximum value of this
|
||||
* sum where the delta > 0. After the loop, clipPoint is either -1 (don't do anything) or the
|
||||
* clipping index in the read (from the end).
|
||||
* </dd>
|
||||
* </dd><br />
|
||||
* <dt>Cycle based clipping</dt>
|
||||
* <dd>Clips machine cycles from the read. Accepts a string of ranges of the form start1-end1,start2-end2, etc.
|
||||
* For each start/end pair, removes bases in machine cycles from start to end, inclusive. These are 1-based values (positions).
|
||||
* For example, 1-5,10-12 clips the first 5 bases, and then three bases at cycles 10, 11, and 12.
|
||||
* </dd>
|
||||
* </dd><br />
|
||||
* <dt>Sequence matching</dt>
|
||||
* <dd>Clips bases that exactly match one of a number of base sequences. This employs an exact match algorithm,
|
||||
* filtering only bases whose sequence exactly matches SEQ.</dd>
|
||||
* </dl>
|
||||
*
|
||||
* </p>
|
||||
*
|
||||
* <h3>Input</h3>
|
||||
* <p>
|
||||
|
|
@ -99,7 +97,7 @@ import java.util.regex.Pattern;
|
|||
* operation applied to each read.
|
||||
* </p>
|
||||
* <p>
|
||||
* <h3>Summary output</h3>
|
||||
* <h4>Summary output (console)</h4>
|
||||
* <pre>
|
||||
* Number of examined reads 13
|
||||
* Number of clipped reads 13
|
||||
|
|
@ -113,16 +111,29 @@ import java.util.regex.Pattern;
|
|||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* <h3>Example clipping</h3>
|
||||
* Suppose we are given this read:
|
||||
* <h3>Example</h3>
|
||||
* <pre>
|
||||
* java -jar GenomeAnalysisTK.jar \
|
||||
* -T ClipReads \
|
||||
* -R reference.fasta \
|
||||
* -I original.bam \
|
||||
* -o clipped.bam \
|
||||
* -XF seqsToClip.fasta \
|
||||
* -X CCCCC \
|
||||
* -CT "1-5,11-15" \
|
||||
* -QT 10
|
||||
* </pre>
|
||||
* <p>The command line shown above will apply all three options in combination. See the detailed examples below to see how the choice of clipping representation affects the output.</p>
|
||||
*
|
||||
* <h4>Detailed clipping examples</h4>
|
||||
* <p>Suppose we are given this read:</p>
|
||||
* <pre>
|
||||
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
|
||||
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
|
||||
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
|
||||
* </pre>
|
||||
*
|
||||
* If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:
|
||||
* <p>If we are clipping reads with -QT 10 and -CR WRITE_NS, we get:</p>
|
||||
*
|
||||
* <pre>
|
||||
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
|
||||
|
|
@ -130,26 +141,20 @@ import java.util.regex.Pattern;
|
|||
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
|
||||
* </pre>
|
||||
*
|
||||
* Whereas with -CR WRITE_Q0S:
|
||||
* <p>Whereas with -QT 10 -CR WRITE_Q0S:</p>
|
||||
* <pre>
|
||||
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3116 29 76M * * *
|
||||
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
|
||||
* !!!!!!!!!!!!!!!!!4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
|
||||
* </pre>
|
||||
*
|
||||
* Or -CR SOFTCLIP_BASES:
|
||||
* <p>Or -QT 10 -CR SOFTCLIP_BASES:</p>
|
||||
* <pre>
|
||||
* 314KGAAXX090507:1:19:1420:1123#0 16 chrM 3133 29 17S59M * * *
|
||||
* TAGGACCCGGGCCCCCCTCCCCAATCCTCCAACGCATATAGCGGCCGCGCCTTCCCCCGTAAATGATATCATCTCA
|
||||
* #################4?6/?2135;;;'1/=/<'B9;12;68?A79@,@==@9?=AAA3;A@B;A?B54;?ABA
|
||||
* </pre>
|
||||
* </p>
|
||||
*
|
||||
* <h3>Examples</h3>
|
||||
* <pre>
|
||||
* -T ClipReads -I my.bam -I your.bam -o my_and_your.clipped.bam -R Homo_sapiens_assembly18.fasta \
|
||||
* -XF seqsToClip.fasta -X CCCCC -CT "1-5,11-15" -QT 10
|
||||
* </pre>
|
||||
|
||||
* @author Mark DePristo
|
||||
* @since 2010
|
||||
|
|
@ -158,10 +163,9 @@ import java.util.regex.Pattern;
|
|||
@Requires({DataSource.READS})
|
||||
public class ClipReads extends ReadWalker<ClipReads.ReadClipperWithData, ClipReads.ClippingData> {
|
||||
/**
|
||||
* If provided, ClipReads will write summary statistics about the clipping operations applied
|
||||
* to the reads to this file.
|
||||
* If provided, ClipReads will write to this file summary statistics about the clipping operations applied to the reads.
|
||||
*/
|
||||
@Output(fullName = "outputStatistics", shortName = "os", doc = "Write output statistics to this file", required = false, defaultToStdout = false)
|
||||
@Output(fullName = "outputStatistics", shortName = "os", doc = "File to output statistics", required = false, defaultToStdout = false)
|
||||
PrintStream out = null;
|
||||
|
||||
/**
|
||||
|
|
@ -305,7 +309,7 @@ public class ClipReads extends ReadWalker<ClipReads.ReadClipperWithData, ClipRea
|
|||
*/
|
||||
public ReadClipperWithData map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
|
||||
if ( onlyDoRead == null || read.getReadName().equals(onlyDoRead) ) {
|
||||
if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES )
|
||||
if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES || clippingRepresentation == ClippingRepresentation.REVERT_SOFTCLIPPED_BASES )
|
||||
read = ReadClipper.revertSoftClippedBases(read);
|
||||
ReadClipperWithData clipper = new ReadClipperWithData(read, sequencesToClip);
|
||||
|
||||
|
|
@ -600,4 +604,4 @@ public class ClipReads extends ReadWalker<ClipReads.ReadClipperWithData, ClipRea
|
|||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue