ClipReads now supports HARDCLIP_BASES, though in fact this turned out to be not necessary for my desired tests. In the process of developing the HARDCLIP mode, I added some proper ReadUtils unit tests, which would ideally be expanded to include other ReadUtil functions, as added

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5890 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-05-27 11:42:22 +00:00
parent 549172af10
commit 136c8c7900
6 changed files with 139 additions and 32 deletions

View File

@ -47,6 +47,7 @@ import java.io.File;
import java.io.PrintStream;
import net.sf.samtools.util.StringUtil;
import org.broadinstitute.sting.utils.sam.ReadUtils;
/**
* This ReadWalker provides simple, yet powerful read clipping capabilities. It allows the user to clip bases in reads
@ -154,8 +155,10 @@ public class ClipReadsWalker extends ReadWalker<ReadClipper, ClipReadsWalker.Cli
}
}
if (outputBam != null)
outputBam.setPresorted(clippingRepresentation != ClippingRepresentation.SOFTCLIP_BASES);
if (outputBam != null) {
EnumSet<ClippingRepresentation> presorted = EnumSet.of(ClippingRepresentation.WRITE_NS, ClippingRepresentation.WRITE_NS_Q0S, ClippingRepresentation.WRITE_Q0S);
outputBam.setPresorted(presorted.contains(clippingRepresentation));
}
}
/**
@ -179,6 +182,9 @@ public class ClipReadsWalker extends ReadWalker<ReadClipper, ClipReadsWalker.Cli
*/
public ReadClipper map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
if ( onlyDoRead == null || read.getReadName().equals(onlyDoRead) ) {
if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES ) {
read = ReadUtils.replaceSoftClipsWithMatches(read);
}
ReadClipper clipper = new ReadClipper(read);
//
@ -323,10 +329,11 @@ public class ClipReadsWalker extends ReadWalker<ReadClipper, ClipReadsWalker.Cli
if ( clipper == null )
return data;
SAMRecord clippedRead = clipper.clipRead(clippingRepresentation);
if (outputBam != null) {
outputBam.addAlignment(clipper.clipRead(clippingRepresentation));
outputBam.addAlignment(clippedRead);
} else {
out.println(clipper.clipRead(clippingRepresentation).format());
out.println(clippedRead.format());
}
data.nTotalReads++;

View File

@ -7,6 +7,7 @@ import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.walkers.ClipReadsWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.Vector;
@ -45,7 +46,7 @@ public class ClippingOp {
* @param algorithm
* @param clippedRead
*/
public void apply(ClippingRepresentation algorithm, SAMRecord clippedRead) {
public SAMRecord apply(ClippingRepresentation algorithm, SAMRecord clippedRead) {
//clippedRead.setReferenceIndex(1);
byte[] quals = clippedRead.getBaseQualities();
byte[] bases = clippedRead.getReadBases();
@ -72,6 +73,7 @@ public class ClippingOp {
clippedRead.setReadBases(bases);
clippedRead.setBaseQualities(quals);
break;
case HARDCLIP_BASES:
case SOFTCLIP_BASES:
if ( ! clippedRead.getReadUnmappedFlag() ) {
// we can't process unmapped reads
@ -97,18 +99,6 @@ public class ClippingOp {
else
scRight = start;
// if ( clippedRead.getReadNegativeStrandFlag() ) {
// if ( start == 0 )
// scLeft = myStop + 1;
// else
// scRight = start;
// } else {
// if ( start == 0 )
// scLeft = myStop;
// else
// scRight = start;
// }
Cigar newCigar = softClip(oldCigar, scLeft, scRight);
clippedRead.setCigar(newCigar);
@ -116,14 +106,22 @@ public class ClippingOp {
int newStart = clippedRead.getAlignmentStart() + newClippedStart;
clippedRead.setAlignmentStart(newStart);
if ( algorithm == ClippingRepresentation.HARDCLIP_BASES )
clippedRead = ReadUtils.hardClipSoftClippedBases(clippedRead);
//System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart);
} else if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) {
// we can hard clip unmapped reads
if ( clippedRead.getReadNegativeStrandFlag() )
clippedRead = ReadUtils.hardClipBases(clippedRead, 0, start, null);
else
clippedRead = ReadUtils.hardClipBases(clippedRead, start, start + getLength(), null);
}
break;
//throw new RuntimeException("Softclipping of bases not yet implemented.");
default:
throw new IllegalStateException("Unexpected Clipping operator type " + algorithm);
}
return clippedRead;
}
/**

View File

@ -7,5 +7,6 @@ public enum ClippingRepresentation {
WRITE_NS, // change the bases to Ns
WRITE_Q0S, // change the quality scores to Q0
WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns
SOFTCLIP_BASES // change cigar string to S, but keep bases
SOFTCLIP_BASES, // change cigar string to S, but keep bases
HARDCLIP_BASES // remove the bases from the read
}

View File

@ -58,7 +58,7 @@ public class ReadClipper {
try {
SAMRecord clippedRead = (SAMRecord) read.clone();
for (ClippingOp op : getOps()) {
op.apply(algorithm, clippedRead);
clippedRead = op.apply(algorithm, clippedRead);
}
return clippedRead;
} catch (CloneNotSupportedException e) {

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.utils.sam;
import com.google.java.contract.*;
import net.sf.samtools.*;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -370,23 +371,26 @@ public class ReadUtils {
* @param rec
* @return
*/
@Requires("rec != null")
@Ensures("result != null")
public static SAMRecord hardClipSoftClippedBases(SAMRecord rec) {
List<CigarElement> cigarElts = rec.getCigar().getCigarElements();
if ( cigarElts.size() == 1 ) // can't be soft clipped, just return
return rec;
int basesStart = 0;
int keepStart = 0, keepEnd = rec.getReadLength() - 1;
List<CigarElement> newCigarElements = new LinkedList<CigarElement>();
int basesToClip = 0;
for ( int i = 0; i < cigarElts.size(); i++ ) {
CigarElement ce = cigarElts.get(i);
int l = ce.getLength();
switch ( ce.getOperator() ) {
case S:
basesToClip += l;
if ( i == 0 ) basesStart += l;
if ( i == 0 )
keepStart = l;
else
keepEnd = rec.getReadLength() - l - 1;
newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP));
break;
case H:
@ -398,23 +402,46 @@ public class ReadUtils {
}
}
if ( basesToClip > 0 ) {
return hardClipBases(rec, keepStart, keepEnd, newCigarElements);
}
/**
* Hard clips out the bases in rec, keeping the bases from keepStart to keepEnd, inclusive. Note these
* are offsets, so they are 0 based
*
* @param rec
* @param keepStart
* @param keepEnd
* @param newCigarElements
* @return
*/
@Requires({
"rec != null",
"keepStart >= 0",
"keepEnd < rec.getReadLength()",
"rec.getReadUnmappedFlag() || newCigarElements != null"})
@Ensures("result != null")
public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd,
List<CigarElement> newCigarElements) {
int newLength = keepEnd - keepStart + 1;
if ( newLength != rec.getReadLength() ) {
try {
rec = SimplifyingSAMFileWriter.simplifyRead((SAMRecord)rec.clone());
// copy over the unclipped bases
final byte[] bases = rec.getReadBases();
final byte[] quals = rec.getBaseQualities();
int newLength = bases.length - basesToClip;
byte[] newBases = new byte[newLength];
byte[] newQuals = new byte[newLength];
System.arraycopy(bases, basesStart, newBases, 0, newLength);
System.arraycopy(quals, basesStart, newQuals, 0, newLength);
System.arraycopy(bases, keepStart, newBases, 0, newLength);
System.arraycopy(quals, keepStart, newQuals, 0, newLength);
rec.setReadBases(newBases);
rec.setBaseQualities(newQuals);
// now add a CIGAR element for the clipped bases
Cigar newCigar = new Cigar(newCigarElements);
rec.setCigar(newCigar);
// now add a CIGAR element for the clipped bases, if the read isn't unmapped
if ( ! rec.getReadUnmappedFlag() ) {
Cigar newCigar = new Cigar(newCigarElements);
rec.setCigar(newCigar);
}
} catch ( CloneNotSupportedException e ) {
throw new ReviewedStingException("WTF, where did clone go?", e);
}
@ -423,6 +450,39 @@ public class ReadUtils {
return rec;
}
public static SAMRecord replaceSoftClipsWithMatches(SAMRecord read) {
List<CigarElement> newCigarElements = new ArrayList<CigarElement>();
for ( CigarElement ce : read.getCigar().getCigarElements() ) {
if ( ce.getOperator() == CigarOperator.SOFT_CLIP )
newCigarElements.add(new CigarElement(ce.getLength(), CigarOperator.MATCH_OR_MISMATCH));
else
newCigarElements.add(ce);
}
if ( newCigarElements.size() > 1 ) { //
CigarElement first = newCigarElements.get(0);
CigarElement second = newCigarElements.get(1);
if ( first.getOperator() == CigarOperator.MATCH_OR_MISMATCH && second.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) {
newCigarElements.set(0, new CigarElement(first.getLength() + second.getLength(), CigarOperator.MATCH_OR_MISMATCH));
newCigarElements.remove(1);
}
}
if ( newCigarElements.size() > 1 ) { //
CigarElement penult = newCigarElements.get(newCigarElements.size()-2);
CigarElement last = newCigarElements.get(newCigarElements.size()-1);
if ( penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH && penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) {
newCigarElements.set(newCigarElements.size()-2, new CigarElement(penult.getLength() + last.getLength(), CigarOperator.MATCH_OR_MISMATCH));
newCigarElements.remove(newCigarElements.size()-1);
}
}
read.setCigar(new Cigar(newCigarElements));
return read;
}
private static int DEFAULT_ADAPTOR_SIZE = 100;
/**

View File

@ -0,0 +1,41 @@
package org.broadinstitute.sting.utils;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.testng.Assert;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;
public class ReadUtilsUnitTest extends BaseTest {
SAMRecord read;
final static String BASES = "ACTG";
final static String QUALS = "!+5?";
@BeforeTest
public void init() {
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000);
read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length());
read.setReadUnmappedFlag(true);
read.setReadBases(new String(BASES).getBytes());
read.setBaseQualityString(new String(QUALS));
}
private void testReadBasesAndQuals(SAMRecord read, int expectedStart, int expectedStop) {
SAMRecord clipped = ReadUtils.hardClipBases(read, expectedStart, expectedStop - 1, null);
String expectedBases = BASES.substring(expectedStart, expectedStop);
String expectedQuals = QUALS.substring(expectedStart, expectedStop);
Assert.assertEquals(clipped.getReadBases(), expectedBases.getBytes(), "Clipped bases not those expected");
Assert.assertEquals(clipped.getBaseQualityString(), expectedQuals, "Clipped quals not those expected");
}
@Test public void testNoClip() { testReadBasesAndQuals(read, 0, 4); }
@Test public void testClip1Front() { testReadBasesAndQuals(read, 1, 4); }
@Test public void testClip2Front() { testReadBasesAndQuals(read, 2, 4); }
@Test public void testClip1Back() { testReadBasesAndQuals(read, 0, 3); }
@Test public void testClip2Back() { testReadBasesAndQuals(read, 0, 2); }
}