ClipReads now supports HARDCLIP_BASES, though in fact this turned out not to be necessary for my desired tests. In the process of developing the HARDCLIP mode, I added some proper ReadUtils unit tests, which would ideally be expanded to cover other ReadUtils functions as they are added.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5890 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
549172af10
commit
136c8c7900
|
|
@ -47,6 +47,7 @@ import java.io.File;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
|
|
||||||
import net.sf.samtools.util.StringUtil;
|
import net.sf.samtools.util.StringUtil;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This ReadWalker provides simple, yet powerful read clipping capabilities. It allows the user to clip bases in reads
|
* This ReadWalker provides simple, yet powerful read clipping capabilities. It allows the user to clip bases in reads
|
||||||
|
|
@ -154,8 +155,10 @@ public class ClipReadsWalker extends ReadWalker<ReadClipper, ClipReadsWalker.Cli
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (outputBam != null)
|
if (outputBam != null) {
|
||||||
outputBam.setPresorted(clippingRepresentation != ClippingRepresentation.SOFTCLIP_BASES);
|
EnumSet<ClippingRepresentation> presorted = EnumSet.of(ClippingRepresentation.WRITE_NS, ClippingRepresentation.WRITE_NS_Q0S, ClippingRepresentation.WRITE_Q0S);
|
||||||
|
outputBam.setPresorted(presorted.contains(clippingRepresentation));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -179,6 +182,9 @@ public class ClipReadsWalker extends ReadWalker<ReadClipper, ClipReadsWalker.Cli
|
||||||
*/
|
*/
|
||||||
public ReadClipper map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
public ReadClipper map(ReferenceContext ref, SAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||||
if ( onlyDoRead == null || read.getReadName().equals(onlyDoRead) ) {
|
if ( onlyDoRead == null || read.getReadName().equals(onlyDoRead) ) {
|
||||||
|
if ( clippingRepresentation == ClippingRepresentation.HARDCLIP_BASES ) {
|
||||||
|
read = ReadUtils.replaceSoftClipsWithMatches(read);
|
||||||
|
}
|
||||||
ReadClipper clipper = new ReadClipper(read);
|
ReadClipper clipper = new ReadClipper(read);
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
@ -323,10 +329,11 @@ public class ClipReadsWalker extends ReadWalker<ReadClipper, ClipReadsWalker.Cli
|
||||||
if ( clipper == null )
|
if ( clipper == null )
|
||||||
return data;
|
return data;
|
||||||
|
|
||||||
|
SAMRecord clippedRead = clipper.clipRead(clippingRepresentation);
|
||||||
if (outputBam != null) {
|
if (outputBam != null) {
|
||||||
outputBam.addAlignment(clipper.clipRead(clippingRepresentation));
|
outputBam.addAlignment(clippedRead);
|
||||||
} else {
|
} else {
|
||||||
out.println(clipper.clipRead(clippingRepresentation).format());
|
out.println(clippedRead.format());
|
||||||
}
|
}
|
||||||
|
|
||||||
data.nTotalReads++;
|
data.nTotalReads++;
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ import net.sf.samtools.SAMRecord;
|
||||||
import org.broadinstitute.sting.gatk.walkers.ClipReadsWalker;
|
import org.broadinstitute.sting.gatk.walkers.ClipReadsWalker;
|
||||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
|
||||||
import java.util.Vector;
|
import java.util.Vector;
|
||||||
|
|
||||||
|
|
@ -45,7 +46,7 @@ public class ClippingOp {
|
||||||
* @param algorithm
|
* @param algorithm
|
||||||
* @param clippedRead
|
* @param clippedRead
|
||||||
*/
|
*/
|
||||||
public void apply(ClippingRepresentation algorithm, SAMRecord clippedRead) {
|
public SAMRecord apply(ClippingRepresentation algorithm, SAMRecord clippedRead) {
|
||||||
//clippedRead.setReferenceIndex(1);
|
//clippedRead.setReferenceIndex(1);
|
||||||
byte[] quals = clippedRead.getBaseQualities();
|
byte[] quals = clippedRead.getBaseQualities();
|
||||||
byte[] bases = clippedRead.getReadBases();
|
byte[] bases = clippedRead.getReadBases();
|
||||||
|
|
@ -72,6 +73,7 @@ public class ClippingOp {
|
||||||
clippedRead.setReadBases(bases);
|
clippedRead.setReadBases(bases);
|
||||||
clippedRead.setBaseQualities(quals);
|
clippedRead.setBaseQualities(quals);
|
||||||
break;
|
break;
|
||||||
|
case HARDCLIP_BASES:
|
||||||
case SOFTCLIP_BASES:
|
case SOFTCLIP_BASES:
|
||||||
if ( ! clippedRead.getReadUnmappedFlag() ) {
|
if ( ! clippedRead.getReadUnmappedFlag() ) {
|
||||||
// we can't process unmapped reads
|
// we can't process unmapped reads
|
||||||
|
|
@ -97,18 +99,6 @@ public class ClippingOp {
|
||||||
else
|
else
|
||||||
scRight = start;
|
scRight = start;
|
||||||
|
|
||||||
// if ( clippedRead.getReadNegativeStrandFlag() ) {
|
|
||||||
// if ( start == 0 )
|
|
||||||
// scLeft = myStop + 1;
|
|
||||||
// else
|
|
||||||
// scRight = start;
|
|
||||||
// } else {
|
|
||||||
// if ( start == 0 )
|
|
||||||
// scLeft = myStop;
|
|
||||||
// else
|
|
||||||
// scRight = start;
|
|
||||||
// }
|
|
||||||
|
|
||||||
Cigar newCigar = softClip(oldCigar, scLeft, scRight);
|
Cigar newCigar = softClip(oldCigar, scLeft, scRight);
|
||||||
clippedRead.setCigar(newCigar);
|
clippedRead.setCigar(newCigar);
|
||||||
|
|
||||||
|
|
@ -116,14 +106,22 @@ public class ClippingOp {
|
||||||
int newStart = clippedRead.getAlignmentStart() + newClippedStart;
|
int newStart = clippedRead.getAlignmentStart() + newClippedStart;
|
||||||
clippedRead.setAlignmentStart(newStart);
|
clippedRead.setAlignmentStart(newStart);
|
||||||
|
|
||||||
|
if ( algorithm == ClippingRepresentation.HARDCLIP_BASES )
|
||||||
|
clippedRead = ReadUtils.hardClipSoftClippedBases(clippedRead);
|
||||||
//System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart);
|
//System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart);
|
||||||
|
} else if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) {
|
||||||
|
// we can hard clip unmapped reads
|
||||||
|
if ( clippedRead.getReadNegativeStrandFlag() )
|
||||||
|
clippedRead = ReadUtils.hardClipBases(clippedRead, 0, start, null);
|
||||||
|
else
|
||||||
|
clippedRead = ReadUtils.hardClipBases(clippedRead, start, start + getLength(), null);
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
//throw new RuntimeException("Softclipping of bases not yet implemented.");
|
|
||||||
default:
|
default:
|
||||||
throw new IllegalStateException("Unexpected Clipping operator type " + algorithm);
|
throw new IllegalStateException("Unexpected Clipping operator type " + algorithm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return clippedRead;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -7,5 +7,6 @@ public enum ClippingRepresentation {
|
||||||
WRITE_NS, // change the bases to Ns
|
WRITE_NS, // change the bases to Ns
|
||||||
WRITE_Q0S, // change the quality scores to Q0
|
WRITE_Q0S, // change the quality scores to Q0
|
||||||
WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns
|
WRITE_NS_Q0S, // change the quality scores to Q0 and write Ns
|
||||||
SOFTCLIP_BASES // change cigar string to S, but keep bases
|
SOFTCLIP_BASES, // change cigar string to S, but keep bases
|
||||||
|
HARDCLIP_BASES // remove the bases from the read
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -58,7 +58,7 @@ public class ReadClipper {
|
||||||
try {
|
try {
|
||||||
SAMRecord clippedRead = (SAMRecord) read.clone();
|
SAMRecord clippedRead = (SAMRecord) read.clone();
|
||||||
for (ClippingOp op : getOps()) {
|
for (ClippingOp op : getOps()) {
|
||||||
op.apply(algorithm, clippedRead);
|
clippedRead = op.apply(algorithm, clippedRead);
|
||||||
}
|
}
|
||||||
return clippedRead;
|
return clippedRead;
|
||||||
} catch (CloneNotSupportedException e) {
|
} catch (CloneNotSupportedException e) {
|
||||||
|
|
|
||||||
|
|
@ -25,6 +25,7 @@
|
||||||
|
|
||||||
package org.broadinstitute.sting.utils.sam;
|
package org.broadinstitute.sting.utils.sam;
|
||||||
|
|
||||||
|
import com.google.java.contract.*;
|
||||||
import net.sf.samtools.*;
|
import net.sf.samtools.*;
|
||||||
import org.broadinstitute.sting.utils.collections.Pair;
|
import org.broadinstitute.sting.utils.collections.Pair;
|
||||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||||
|
|
@ -370,23 +371,26 @@ public class ReadUtils {
|
||||||
* @param rec
|
* @param rec
|
||||||
* @return
|
* @return
|
||||||
*/
|
*/
|
||||||
|
@Requires("rec != null")
|
||||||
|
@Ensures("result != null")
|
||||||
public static SAMRecord hardClipSoftClippedBases(SAMRecord rec) {
|
public static SAMRecord hardClipSoftClippedBases(SAMRecord rec) {
|
||||||
List<CigarElement> cigarElts = rec.getCigar().getCigarElements();
|
List<CigarElement> cigarElts = rec.getCigar().getCigarElements();
|
||||||
|
|
||||||
if ( cigarElts.size() == 1 ) // can't be soft clipped, just return
|
if ( cigarElts.size() == 1 ) // can't be soft clipped, just return
|
||||||
return rec;
|
return rec;
|
||||||
|
|
||||||
int basesStart = 0;
|
int keepStart = 0, keepEnd = rec.getReadLength() - 1;
|
||||||
List<CigarElement> newCigarElements = new LinkedList<CigarElement>();
|
List<CigarElement> newCigarElements = new LinkedList<CigarElement>();
|
||||||
int basesToClip = 0;
|
|
||||||
|
|
||||||
for ( int i = 0; i < cigarElts.size(); i++ ) {
|
for ( int i = 0; i < cigarElts.size(); i++ ) {
|
||||||
CigarElement ce = cigarElts.get(i);
|
CigarElement ce = cigarElts.get(i);
|
||||||
int l = ce.getLength();
|
int l = ce.getLength();
|
||||||
switch ( ce.getOperator() ) {
|
switch ( ce.getOperator() ) {
|
||||||
case S:
|
case S:
|
||||||
basesToClip += l;
|
if ( i == 0 )
|
||||||
if ( i == 0 ) basesStart += l;
|
keepStart = l;
|
||||||
|
else
|
||||||
|
keepEnd = rec.getReadLength() - l - 1;
|
||||||
newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP));
|
newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP));
|
||||||
break;
|
break;
|
||||||
case H:
|
case H:
|
||||||
|
|
@ -398,23 +402,46 @@ public class ReadUtils {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( basesToClip > 0 ) {
|
return hardClipBases(rec, keepStart, keepEnd, newCigarElements);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Hard clips out the bases in rec, keeping the bases from keepStart to keepEnd, inclusive. Note these
|
||||||
|
* are offsets, so they are 0 based
|
||||||
|
*
|
||||||
|
* @param rec
|
||||||
|
* @param keepStart
|
||||||
|
* @param keepEnd
|
||||||
|
* @param newCigarElements
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
@Requires({
|
||||||
|
"rec != null",
|
||||||
|
"keepStart >= 0",
|
||||||
|
"keepEnd < rec.getReadLength()",
|
||||||
|
"rec.getReadUnmappedFlag() || newCigarElements != null"})
|
||||||
|
@Ensures("result != null")
|
||||||
|
public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd,
|
||||||
|
List<CigarElement> newCigarElements) {
|
||||||
|
int newLength = keepEnd - keepStart + 1;
|
||||||
|
if ( newLength != rec.getReadLength() ) {
|
||||||
try {
|
try {
|
||||||
rec = SimplifyingSAMFileWriter.simplifyRead((SAMRecord)rec.clone());
|
rec = SimplifyingSAMFileWriter.simplifyRead((SAMRecord)rec.clone());
|
||||||
// copy over the unclipped bases
|
// copy over the unclipped bases
|
||||||
final byte[] bases = rec.getReadBases();
|
final byte[] bases = rec.getReadBases();
|
||||||
final byte[] quals = rec.getBaseQualities();
|
final byte[] quals = rec.getBaseQualities();
|
||||||
int newLength = bases.length - basesToClip;
|
|
||||||
byte[] newBases = new byte[newLength];
|
byte[] newBases = new byte[newLength];
|
||||||
byte[] newQuals = new byte[newLength];
|
byte[] newQuals = new byte[newLength];
|
||||||
System.arraycopy(bases, basesStart, newBases, 0, newLength);
|
System.arraycopy(bases, keepStart, newBases, 0, newLength);
|
||||||
System.arraycopy(quals, basesStart, newQuals, 0, newLength);
|
System.arraycopy(quals, keepStart, newQuals, 0, newLength);
|
||||||
rec.setReadBases(newBases);
|
rec.setReadBases(newBases);
|
||||||
rec.setBaseQualities(newQuals);
|
rec.setBaseQualities(newQuals);
|
||||||
|
|
||||||
// now add a CIGAR element for the clipped bases
|
// now add a CIGAR element for the clipped bases, if the read isn't unmapped
|
||||||
|
if ( ! rec.getReadUnmappedFlag() ) {
|
||||||
Cigar newCigar = new Cigar(newCigarElements);
|
Cigar newCigar = new Cigar(newCigarElements);
|
||||||
rec.setCigar(newCigar);
|
rec.setCigar(newCigar);
|
||||||
|
}
|
||||||
} catch ( CloneNotSupportedException e ) {
|
} catch ( CloneNotSupportedException e ) {
|
||||||
throw new ReviewedStingException("WTF, where did clone go?", e);
|
throw new ReviewedStingException("WTF, where did clone go?", e);
|
||||||
}
|
}
|
||||||
|
|
@ -423,6 +450,39 @@ public class ReadUtils {
|
||||||
return rec;
|
return rec;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static SAMRecord replaceSoftClipsWithMatches(SAMRecord read) {
|
||||||
|
List<CigarElement> newCigarElements = new ArrayList<CigarElement>();
|
||||||
|
|
||||||
|
for ( CigarElement ce : read.getCigar().getCigarElements() ) {
|
||||||
|
if ( ce.getOperator() == CigarOperator.SOFT_CLIP )
|
||||||
|
newCigarElements.add(new CigarElement(ce.getLength(), CigarOperator.MATCH_OR_MISMATCH));
|
||||||
|
else
|
||||||
|
newCigarElements.add(ce);
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( newCigarElements.size() > 1 ) { //
|
||||||
|
CigarElement first = newCigarElements.get(0);
|
||||||
|
CigarElement second = newCigarElements.get(1);
|
||||||
|
if ( first.getOperator() == CigarOperator.MATCH_OR_MISMATCH && second.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) {
|
||||||
|
newCigarElements.set(0, new CigarElement(first.getLength() + second.getLength(), CigarOperator.MATCH_OR_MISMATCH));
|
||||||
|
newCigarElements.remove(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( newCigarElements.size() > 1 ) { //
|
||||||
|
CigarElement penult = newCigarElements.get(newCigarElements.size()-2);
|
||||||
|
CigarElement last = newCigarElements.get(newCigarElements.size()-1);
|
||||||
|
if ( penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH && penult.getOperator() == CigarOperator.MATCH_OR_MISMATCH ) {
|
||||||
|
newCigarElements.set(newCigarElements.size()-2, new CigarElement(penult.getLength() + last.getLength(), CigarOperator.MATCH_OR_MISMATCH));
|
||||||
|
newCigarElements.remove(newCigarElements.size()-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
read.setCigar(new Cigar(newCigarElements));
|
||||||
|
return read;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private static int DEFAULT_ADAPTOR_SIZE = 100;
|
private static int DEFAULT_ADAPTOR_SIZE = 100;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,41 @@
|
||||||
|
package org.broadinstitute.sting.utils;
|
||||||
|
|
||||||
|
import net.sf.samtools.SAMFileHeader;
|
||||||
|
import net.sf.samtools.SAMRecord;
|
||||||
|
import org.broadinstitute.sting.BaseTest;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||||
|
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||||
|
import org.testng.Assert;
|
||||||
|
import org.testng.annotations.BeforeClass;
|
||||||
|
import org.testng.annotations.BeforeTest;
|
||||||
|
import org.testng.annotations.Test;
|
||||||
|
|
||||||
|
|
||||||
|
public class ReadUtilsUnitTest extends BaseTest {
|
||||||
|
SAMRecord read;
|
||||||
|
final static String BASES = "ACTG";
|
||||||
|
final static String QUALS = "!+5?";
|
||||||
|
|
||||||
|
@BeforeTest
|
||||||
|
public void init() {
|
||||||
|
SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,1000);
|
||||||
|
read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length());
|
||||||
|
read.setReadUnmappedFlag(true);
|
||||||
|
read.setReadBases(new String(BASES).getBytes());
|
||||||
|
read.setBaseQualityString(new String(QUALS));
|
||||||
|
}
|
||||||
|
|
||||||
|
private void testReadBasesAndQuals(SAMRecord read, int expectedStart, int expectedStop) {
|
||||||
|
SAMRecord clipped = ReadUtils.hardClipBases(read, expectedStart, expectedStop - 1, null);
|
||||||
|
String expectedBases = BASES.substring(expectedStart, expectedStop);
|
||||||
|
String expectedQuals = QUALS.substring(expectedStart, expectedStop);
|
||||||
|
Assert.assertEquals(clipped.getReadBases(), expectedBases.getBytes(), "Clipped bases not those expected");
|
||||||
|
Assert.assertEquals(clipped.getBaseQualityString(), expectedQuals, "Clipped quals not those expected");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test public void testNoClip() { testReadBasesAndQuals(read, 0, 4); }
|
||||||
|
@Test public void testClip1Front() { testReadBasesAndQuals(read, 1, 4); }
|
||||||
|
@Test public void testClip2Front() { testReadBasesAndQuals(read, 2, 4); }
|
||||||
|
@Test public void testClip1Back() { testReadBasesAndQuals(read, 0, 3); }
|
||||||
|
@Test public void testClip2Back() { testReadBasesAndQuals(read, 0, 2); }
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue