From 136c8c79005ef5c53c24dab5dd376b96fb1679e5 Mon Sep 17 00:00:00 2001 From: depristo Date: Fri, 27 May 2011 11:42:22 +0000 Subject: [PATCH] ClipReads now supports HARDCLIP_BASES, though in fact this turned out to be not necessary for my desired tests. In the process of developing the HARDCLIP mode, I added some proper ReadUtils unit tests, which would ideally be expanded to include other ReadUtil functions, as added git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5890 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/walkers/ClipReadsWalker.java | 15 +++- .../sting/utils/clipreads/ClippingOp.java | 28 +++---- .../clipreads/ClippingRepresentation.java | 3 +- .../sting/utils/clipreads/ReadClipper.java | 2 +- .../sting/utils/sam/ReadUtils.java | 82 ++++++++++++++++--- .../sting/utils/ReadUtilsUnitTest.java | 41 ++++++++++ 6 files changed, 139 insertions(+), 32 deletions(-) create mode 100755 java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index 91eef6f34..1a3f87a7a 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -47,6 +47,7 @@ import java.io.File; import java.io.PrintStream; import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.utils.sam.ReadUtils; /** * This ReadWalker provides simple, yet powerful read clipping capabilities. It allows the user to clip bases in reads @@ -154,8 +155,10 @@ public class ClipReadsWalker extends ReadWalker presorted = EnumSet.of(ClippingRepresentation.WRITE_NS, ClippingRepresentation.WRITE_NS_Q0S, ClippingRepresentation.WRITE_Q0S); + outputBam.setPresorted(presorted.contains(clippingRepresentation)); + } } /** @@ -179,6 +182,9 @@ public class ClipReadsWalker extends ReadWalker %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart); + } else if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) { + // we can hard clip unmapped reads + if ( clippedRead.getReadNegativeStrandFlag() ) + clippedRead = ReadUtils.hardClipBases(clippedRead, 0, start, null); + else + clippedRead = ReadUtils.hardClipBases(clippedRead, start, start + getLength(), null); } - break; - //throw new RuntimeException("Softclipping of bases not yet implemented."); default: throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); } + + return clippedRead; } /** diff --git a/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java b/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java index 3a35244e5..14c04b5c4 100644 --- a/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java +++ b/java/src/org/broadinstitute/sting/utils/clipreads/ClippingRepresentation.java @@ -7,5 +7,6 @@ public enum ClippingRepresentation { WRITE_NS, // change the bases to Ns WRITE_Q0S, // change the quality scores to Q0 WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns - SOFTCLIP_BASES // change cigar string to S, but keep bases + SOFTCLIP_BASES, // change cigar string to S, but keep bases + HARDCLIP_BASES // remove the bases from the read } diff --git a/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 977074b38..031467ed9 100644 --- a/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -58,7 +58,7 @@ public class ReadClipper { try { SAMRecord clippedRead = (SAMRecord) read.clone(); for (ClippingOp op : getOps()) { - op.apply(algorithm, clippedRead); + clippedRead = op.apply(algorithm, clippedRead); } return clippedRead; } catch (CloneNotSupportedException e) { diff --git a/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index e30584113..2394e34dc 100644 --- a/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils.sam; +import com.google.java.contract.*; import net.sf.samtools.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -370,23 +371,26 @@ public class ReadUtils { * @param rec * @return */ + @Requires("rec != null") + @Ensures("result != null") public static SAMRecord hardClipSoftClippedBases(SAMRecord rec) { List cigarElts = rec.getCigar().getCigarElements(); if ( cigarElts.size() == 1 ) // can't be soft clipped, just return return rec; - int basesStart = 0; + int keepStart = 0, keepEnd = rec.getReadLength() - 1; List newCigarElements = new LinkedList(); - int basesToClip = 0; for ( int i = 0; i < cigarElts.size(); i++ ) { CigarElement ce = cigarElts.get(i); int l = ce.getLength(); switch ( ce.getOperator() ) { case S: - basesToClip += l; - if ( i == 0 ) basesStart += l; + if ( i == 0 ) + keepStart = l; + else + keepEnd = rec.getReadLength() - l - 1; newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP)); break; case H: @@ -398,23 +402,46 @@ public class ReadUtils { } } - if ( basesToClip > 0 ) { + return hardClipBases(rec, keepStart, keepEnd, newCigarElements); + } + + /** + * Hard clips out the bases in rec, keeping the bases from keepStart to keepEnd, inclusive. Note these + * are offsets, so they are 0 based + * + * @param rec + * @param keepStart + * @param keepEnd + * @param newCigarElements + * @return + */ + @Requires({ + "rec != null", + "keepStart >= 0", + "keepEnd < rec.getReadLength()", + "rec.getReadUnmappedFlag() || newCigarElements != null"}) + @Ensures("result != null") + public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd, + List newCigarElements) { + int newLength = keepEnd - keepStart + 1; + if ( newLength != rec.getReadLength() ) { try { rec = SimplifyingSAMFileWriter.simplifyRead((SAMRecord)rec.clone()); // copy over the unclipped bases final byte[] bases = rec.getReadBases(); final byte[] quals = rec.getBaseQualities(); - int newLength = bases.length - basesToClip; byte[] newBases = new byte[newLength]; byte[] newQuals = new byte[newLength]; - System.arraycopy(bases, basesStart, newBases, 0, newLength); - System.arraycopy(quals, basesStart, newQuals, 0, newLength); + System.arraycopy(bases, keepStart, newBases, 0, newLength); + System.arraycopy(quals, keepStart, newQuals, 0, newLength); rec.setReadBases(newBases); rec.setBaseQualities(newQuals); - // now add a CIGAR element for the clipped bases - Cigar newCigar = new Cigar(newCigarElements); - rec.setCigar(newCigar); + // now add a CIGAR element for the clipped bases, if the read isn't unmapped + if ( ! rec.getReadUnmappedFlag() ) { + Cigar newCigar = new Cigar(newCigarElements); + rec.setCigar(newCigar); + } } catch ( CloneNotSupportedException e ) { throw new ReviewedStingException("WTF, where did clone go?", e); } @@ -423,6 +450,39 @@ public class ReadUtils { return rec; } + public static SAMRecord replaceSoftClipsWithMatches(SAMRecord read) { + List newCigarElements = new ArrayList(); + + for ( CigarElement ce : read.getCigar().getCigarElements() ) { + if ( ce.getOperator() == CigarOperator.SOFT_CLIP ) + newCigarElements.add(new CigarElement(ce.getLength(), CigarOperator.MATCH_OR_MISMATCH)); + else + newCigarElements.add(ce); + } + + if ( newCigarElements.size() > 1 ) { // + CigarElement first = newCigarElements.get(0); + CigarElement second = newCigarElements.get(1); + if ( first.getOperator() == CigarOperator.MATCH_OR_MISMATCH && second.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) { + newCigarElements.set(0, new CigarElement(first.getLength() + second.getLength(), CigarOperator.MATCH_OR_MISMATCH)); + newCigarElements.remove(1); + } + } + + if ( newCigarElements.size() > 1 ) { // + CigarElement penult = newCigarElements.get(newCigarElements.size()-2); + CigarElement last = newCigarElements.get(newCigarElements.size()-1); + if ( penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH && penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) { + newCigarElements.set(newCigarElements.size()-2, new CigarElement(penult.getLength() + last.getLength(), CigarOperator.MATCH_OR_MISMATCH)); + newCigarElements.remove(newCigarElements.size()-1); + } + } + + read.setCigar(new Cigar(newCigarElements)); + return read; + } + + private static int DEFAULT_ADAPTOR_SIZE = 100; /** diff --git a/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java b/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java new file mode 100755 index 000000000..7cb7fec98 --- /dev/null +++ b/java/test/org/broadinstitute/sting/utils/ReadUtilsUnitTest.java @@ -0,0 +1,41 @@ +package org.broadinstitute.sting.utils; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + + +public class ReadUtilsUnitTest extends BaseTest { + SAMRecord read; + final static String BASES = "ACTG"; + final static String QUALS = "!+5?"; + + @BeforeTest + public void init() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000); + read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); + read.setReadUnmappedFlag(true); + read.setReadBases(new String(BASES).getBytes()); + read.setBaseQualityString(new String(QUALS)); + } + + private void testReadBasesAndQuals(SAMRecord read, int expectedStart, int expectedStop) { + SAMRecord clipped = ReadUtils.hardClipBases(read, expectedStart, expectedStop - 1, null); + String expectedBases = BASES.substring(expectedStart, expectedStop); + String expectedQuals = QUALS.substring(expectedStart, expectedStop); + Assert.assertEquals(clipped.getReadBases(), expectedBases.getBytes(), "Clipped bases not those expected"); + Assert.assertEquals(clipped.getBaseQualityString(), expectedQuals, "Clipped quals not those expected"); + } + + @Test public void testNoClip() { testReadBasesAndQuals(read, 0, 4); } + @Test public void testClip1Front() { testReadBasesAndQuals(read, 1, 4); } + @Test public void testClip2Front() { testReadBasesAndQuals(read, 2, 4); } + @Test public void testClip1Back() { testReadBasesAndQuals(read, 0, 3); } + @Test public void testClip2Back() { testReadBasesAndQuals(read, 0, 2); } +}