ClipReads now supports HARDCLIP_BASES, though in fact this turned out to be not necessary for my desired tests. In the process of developing the HARDCLIP mode, I added some proper ReadUtils unit tests, which would ideally be expanded to include other ReadUtil functions, as added
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5890 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
549172af10
commit
136c8c7900
|
|
@ -47,6 +47,7 @@ import java.io.File;
|
|||
import java.io.PrintStream;
|
||||
|
||||
import net.sf.samtools.util.StringUtil;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
/**
|
||||
* This ReadWalker provides simple, yet powerful read clipping capabilities. It allows the user to clip bases in reads
|
||||
|
|
@ -154,8 +155,10 @@ public class ClipReadsWalker extends ReadWalker<ReadClipper, ClipReadsWalker.Cli
|
|||
}
|
||||
}
|
||||
|
||||
if (outputBam != null)
|
||||
outputBam.setPresorted(clippingRepresentation != ClippingRepresentation.SOFTCLIP_BASES);
|
||||
if (outputBam != null) {
|
||||
EnumSet<ClippingRepresentation> presorted = EnumSet.of(ClippingRepresentation.WRITE_NS, ClippingRepresentation.WRITE_NS_Q0S, ClippingRepresentation.WRITE_Q0S);
|
||||
outputBam.setPresorted(presorted.contains(clippingRepresentation));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -179,6 +182,9 @@ public class ClipReadsWalker extends ReadWalker<ReadClipper, ClipReadsWalker.Cli
|
|||
*/
|
||||
public ReadClipper map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
if ( onlyDoRead == null || read.getReadName().equals(onlyDoRead) ) {
|
||||
if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES ) {
|
||||
read = ReadUtils.replaceSoftClipsWithMatches(read);
|
||||
}
|
||||
ReadClipper clipper = new ReadClipper(read);
|
||||
|
||||
//
|
||||
|
|
@ -323,10 +329,11 @@ public class ClipReadsWalker extends ReadWalker<ReadClipper, ClipReadsWalker.Cli
|
|||
if ( clipper == null )
|
||||
return data;
|
||||
|
||||
SAMRecord clippedRead = clipper.clipRead(clippingRepresentation);
|
||||
if (outputBam != null) {
|
||||
outputBam.addAlignment(clipper.clipRead(clippingRepresentation));
|
||||
outputBam.addAlignment(clippedRead);
|
||||
} else {
|
||||
out.println(clipper.clipRead(clippingRepresentation).format());
|
||||
out.println(clippedRead.format());
|
||||
}
|
||||
|
||||
data.nTotalReads++;
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import net.sf.samtools.SAMRecord;
|
|||
import org.broadinstitute.sting.gatk.walkers.ClipReadsWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.Vector;
|
||||
|
||||
|
|
@ -45,7 +46,7 @@ public class ClippingOp {
|
|||
* @param algorithm
|
||||
* @param clippedRead
|
||||
*/
|
||||
public void apply(ClippingRepresentation algorithm, SAMRecord clippedRead) {
|
||||
public SAMRecord apply(ClippingRepresentation algorithm, SAMRecord clippedRead) {
|
||||
//clippedRead.setReferenceIndex(1);
|
||||
byte[] quals = clippedRead.getBaseQualities();
|
||||
byte[] bases = clippedRead.getReadBases();
|
||||
|
|
@ -72,6 +73,7 @@ public class ClippingOp {
|
|||
clippedRead.setReadBases(bases);
|
||||
clippedRead.setBaseQualities(quals);
|
||||
break;
|
||||
case HARDCLIP_BASES:
|
||||
case SOFTCLIP_BASES:
|
||||
if ( ! clippedRead.getReadUnmappedFlag() ) {
|
||||
// we can't process unmapped reads
|
||||
|
|
@ -97,18 +99,6 @@ public class ClippingOp {
|
|||
else
|
||||
scRight = start;
|
||||
|
||||
// if ( clippedRead.getReadNegativeStrandFlag() ) {
|
||||
// if ( start == 0 )
|
||||
// scLeft = myStop + 1;
|
||||
// else
|
||||
// scRight = start;
|
||||
// } else {
|
||||
// if ( start == 0 )
|
||||
// scLeft = myStop;
|
||||
// else
|
||||
// scRight = start;
|
||||
// }
|
||||
|
||||
Cigar newCigar = softClip(oldCigar, scLeft, scRight);
|
||||
clippedRead.setCigar(newCigar);
|
||||
|
||||
|
|
@ -116,14 +106,22 @@ public class ClippingOp {
|
|||
int newStart = clippedRead.getAlignmentStart() + newClippedStart;
|
||||
clippedRead.setAlignmentStart(newStart);
|
||||
|
||||
if ( algorithm == ClippingRepresentation.HARDCLIP_BASES )
|
||||
clippedRead = ReadUtils.hardClipSoftClippedBases(clippedRead);
|
||||
//System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart);
|
||||
} else if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) {
|
||||
// we can hard clip unmapped reads
|
||||
if ( clippedRead.getReadNegativeStrandFlag() )
|
||||
clippedRead = ReadUtils.hardClipBases(clippedRead, 0, start, null);
|
||||
else
|
||||
clippedRead = ReadUtils.hardClipBases(clippedRead, start, start + getLength(), null);
|
||||
}
|
||||
|
||||
break;
|
||||
//throw new RuntimeException("Softclipping of bases not yet implemented.");
|
||||
default:
|
||||
throw new IllegalStateException("Unexpected Clipping operator type " + algorithm);
|
||||
}
|
||||
|
||||
return clippedRead;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -7,5 +7,6 @@ public enum ClippingRepresentation {
|
|||
WRITE_NS, // change the bases to Ns
|
||||
WRITE_Q0S, // change the quality scores to Q0
|
||||
WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns
|
||||
SOFTCLIP_BASES // change cigar string to S, but keep bases
|
||||
SOFTCLIP_BASES, // change cigar string to S, but keep bases
|
||||
HARDCLIP_BASES // remove the bases from the read
|
||||
}
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ public class ReadClipper {
|
|||
try {
|
||||
SAMRecord clippedRead = (SAMRecord) read.clone();
|
||||
for (ClippingOp op : getOps()) {
|
||||
op.apply(algorithm, clippedRead);
|
||||
clippedRead = op.apply(algorithm, clippedRead);
|
||||
}
|
||||
return clippedRead;
|
||||
} catch (CloneNotSupportedException e) {
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
|
||||
package org.broadinstitute.sting.utils.sam;
|
||||
|
||||
import com.google.java.contract.*;
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -370,23 +371,26 @@ public class ReadUtils {
|
|||
* @param rec
|
||||
* @return
|
||||
*/
|
||||
@Requires("rec != null")
|
||||
@Ensures("result != null")
|
||||
public static SAMRecord hardClipSoftClippedBases(SAMRecord rec) {
|
||||
List<CigarElement> cigarElts = rec.getCigar().getCigarElements();
|
||||
|
||||
if ( cigarElts.size() == 1 ) // can't be soft clipped, just return
|
||||
return rec;
|
||||
|
||||
int basesStart = 0;
|
||||
int keepStart = 0, keepEnd = rec.getReadLength() - 1;
|
||||
List<CigarElement> newCigarElements = new LinkedList<CigarElement>();
|
||||
int basesToClip = 0;
|
||||
|
||||
for ( int i = 0; i < cigarElts.size(); i++ ) {
|
||||
CigarElement ce = cigarElts.get(i);
|
||||
int l = ce.getLength();
|
||||
switch ( ce.getOperator() ) {
|
||||
case S:
|
||||
basesToClip += l;
|
||||
if ( i == 0 ) basesStart += l;
|
||||
if ( i == 0 )
|
||||
keepStart = l;
|
||||
else
|
||||
keepEnd = rec.getReadLength() - l - 1;
|
||||
newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP));
|
||||
break;
|
||||
case H:
|
||||
|
|
@ -398,23 +402,46 @@ public class ReadUtils {
|
|||
}
|
||||
}
|
||||
|
||||
if ( basesToClip > 0 ) {
|
||||
return hardClipBases(rec, keepStart, keepEnd, newCigarElements);
|
||||
}
|
||||
|
||||
/**
|
||||
* Hard clips out the bases in rec, keeping the bases from keepStart to keepEnd, inclusive. Note these
|
||||
* are offsets, so they are 0 based
|
||||
*
|
||||
* @param rec
|
||||
* @param keepStart
|
||||
* @param keepEnd
|
||||
* @param newCigarElements
|
||||
* @return
|
||||
*/
|
||||
@Requires({
|
||||
"rec != null",
|
||||
"keepStart >= 0",
|
||||
"keepEnd < rec.getReadLength()",
|
||||
"rec.getReadUnmappedFlag() || newCigarElements != null"})
|
||||
@Ensures("result != null")
|
||||
public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd,
|
||||
List<CigarElement> newCigarElements) {
|
||||
int newLength = keepEnd - keepStart + 1;
|
||||
if ( newLength != rec.getReadLength() ) {
|
||||
try {
|
||||
rec = SimplifyingSAMFileWriter.simplifyRead((SAMRecord)rec.clone());
|
||||
// copy over the unclipped bases
|
||||
final byte[] bases = rec.getReadBases();
|
||||
final byte[] quals = rec.getBaseQualities();
|
||||
int newLength = bases.length - basesToClip;
|
||||
byte[] newBases = new byte[newLength];
|
||||
byte[] newQuals = new byte[newLength];
|
||||
System.arraycopy(bases, basesStart, newBases, 0, newLength);
|
||||
System.arraycopy(quals, basesStart, newQuals, 0, newLength);
|
||||
System.arraycopy(bases, keepStart, newBases, 0, newLength);
|
||||
System.arraycopy(quals, keepStart, newQuals, 0, newLength);
|
||||
rec.setReadBases(newBases);
|
||||
rec.setBaseQualities(newQuals);
|
||||
|
||||
// now add a CIGAR element for the clipped bases
|
||||
Cigar newCigar = new Cigar(newCigarElements);
|
||||
rec.setCigar(newCigar);
|
||||
// now add a CIGAR element for the clipped bases, if the read isn't unmapped
|
||||
if ( ! rec.getReadUnmappedFlag() ) {
|
||||
Cigar newCigar = new Cigar(newCigarElements);
|
||||
rec.setCigar(newCigar);
|
||||
}
|
||||
} catch ( CloneNotSupportedException e ) {
|
||||
throw new ReviewedStingException("WTF, where did clone go?", e);
|
||||
}
|
||||
|
|
@ -423,6 +450,39 @@ public class ReadUtils {
|
|||
return rec;
|
||||
}
|
||||
|
||||
public static SAMRecord replaceSoftClipsWithMatches(SAMRecord read) {
|
||||
List<CigarElement> newCigarElements = new ArrayList<CigarElement>();
|
||||
|
||||
for ( CigarElement ce : read.getCigar().getCigarElements() ) {
|
||||
if ( ce.getOperator() == CigarOperator.SOFT_CLIP )
|
||||
newCigarElements.add(new CigarElement(ce.getLength(), CigarOperator.MATCH_OR_MISMATCH));
|
||||
else
|
||||
newCigarElements.add(ce);
|
||||
}
|
||||
|
||||
if ( newCigarElements.size() > 1 ) { //
|
||||
CigarElement first = newCigarElements.get(0);
|
||||
CigarElement second = newCigarElements.get(1);
|
||||
if ( first.getOperator() == CigarOperator.MATCH_OR_MISMATCH && second.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) {
|
||||
newCigarElements.set(0, new CigarElement(first.getLength() + second.getLength(), CigarOperator.MATCH_OR_MISMATCH));
|
||||
newCigarElements.remove(1);
|
||||
}
|
||||
}
|
||||
|
||||
if ( newCigarElements.size() > 1 ) { //
|
||||
CigarElement penult = newCigarElements.get(newCigarElements.size()-2);
|
||||
CigarElement last = newCigarElements.get(newCigarElements.size()-1);
|
||||
if ( penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH && penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) {
|
||||
newCigarElements.set(newCigarElements.size()-2, new CigarElement(penult.getLength() + last.getLength(), CigarOperator.MATCH_OR_MISMATCH));
|
||||
newCigarElements.remove(newCigarElements.size()-1);
|
||||
}
|
||||
}
|
||||
|
||||
read.setCigar(new Cigar(newCigarElements));
|
||||
return read;
|
||||
}
|
||||
|
||||
|
||||
private static int DEFAULT_ADAPTOR_SIZE = 100;
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,41 @@
|
|||
package org.broadinstitute.sting.utils;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.BeforeTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
|
||||
public class ReadUtilsUnitTest extends BaseTest {
|
||||
SAMRecord read;
|
||||
final static String BASES = "ACTG";
|
||||
final static String QUALS = "!+5?";
|
||||
|
||||
@BeforeTest
|
||||
public void init() {
|
||||
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000);
|
||||
read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length());
|
||||
read.setReadUnmappedFlag(true);
|
||||
read.setReadBases(new String(BASES).getBytes());
|
||||
read.setBaseQualityString(new String(QUALS));
|
||||
}
|
||||
|
||||
private void testReadBasesAndQuals(SAMRecord read, int expectedStart, int expectedStop) {
|
||||
SAMRecord clipped = ReadUtils.hardClipBases(read, expectedStart, expectedStop - 1, null);
|
||||
String expectedBases = BASES.substring(expectedStart, expectedStop);
|
||||
String expectedQuals = QUALS.substring(expectedStart, expectedStop);
|
||||
Assert.assertEquals(clipped.getReadBases(), expectedBases.getBytes(), "Clipped bases not those expected");
|
||||
Assert.assertEquals(clipped.getBaseQualityString(), expectedQuals, "Clipped quals not those expected");
|
||||
}
|
||||
|
||||
@Test public void testNoClip() { testReadBasesAndQuals(read, 0, 4); }
|
||||
@Test public void testClip1Front() { testReadBasesAndQuals(read, 1, 4); }
|
||||
@Test public void testClip2Front() { testReadBasesAndQuals(read, 2, 4); }
|
||||
@Test public void testClip1Back() { testReadBasesAndQuals(read, 0, 3); }
|
||||
@Test public void testClip2Back() { testReadBasesAndQuals(read, 0, 2); }
|
||||
}
|
||||
Loading…
Reference in New Issue