Merge branch 'master' of ssh://nickel.broadinstitute.org/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Guillermo del Angel 2011-08-23 10:51:24 -04:00
commit bcc0cae89e
3 changed files with 245 additions and 53 deletions

View File

@ -1,13 +1,19 @@
package org.broadinstitute.sting.utils.clipreads;
import com.google.java.contract.Requires;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMRecord;
import org.apache.poi.hssf.record.PageBreakRecord;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.io.PipedOutputStream;
import java.lang.reflect.Array;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;
/**
@ -31,16 +37,15 @@ public class ClippingOp {
}
/**
* Clips the bases in clippedRead according to this operation's start and stop. Uses the clipping
* Clips the bases in read according to this operation's start and stop. Uses the clipping
* representation used is the one provided by algorithm argument.
*
* @param algorithm
* @param clippedRead
* @param read
*/
public SAMRecord apply(ClippingRepresentation algorithm, SAMRecord clippedRead) {
//clippedRead.setReferenceIndex(1);
byte[] quals = clippedRead.getBaseQualities();
byte[] bases = clippedRead.getReadBases();
public SAMRecord apply(ClippingRepresentation algorithm, SAMRecord read) {
byte[] quals = read.getBaseQualities();
byte[] bases = read.getReadBases();
switch (algorithm) {
// important note:
@ -49,59 +54,57 @@ public class ClippingOp {
case WRITE_NS:
for (int i = start; i <= stop; i++)
bases[i] = 'N';
clippedRead.setReadBases(bases);
read.setReadBases(bases);
break;
case WRITE_Q0S:
for (int i = start; i <= stop; i++)
quals[i] = 0;
clippedRead.setBaseQualities(quals);
read.setBaseQualities(quals);
break;
case WRITE_NS_Q0S:
for (int i = start; i <= stop; i++) {
bases[i] = 'N';
quals[i] = 0;
}
clippedRead.setReadBases(bases);
clippedRead.setBaseQualities(quals);
read.setReadBases(bases);
read.setBaseQualities(quals);
break;
case HARDCLIP_BASES:
read = hardClip(read, start, stop);
break;
case SOFTCLIP_BASES:
if ( clippedRead.getReadUnmappedFlag() ) {
if ( read.getReadUnmappedFlag() ) {
// we can't process unmapped reads
throw new UserException("Read Clipper cannot soft/hard clip unmapped reads");
throw new UserException("Read Clipper cannot soft clip unmapped reads");
}
//System.out.printf("%d %d %d%n", stop, start, clippedRead.getReadLength());
//System.out.printf("%d %d %d%n", stop, start, read.getReadLength());
int myStop = stop;
if ( (stop + 1 - start) == clippedRead.getReadLength() ) {
if ( (stop + 1 - start) == read.getReadLength() ) {
// BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone
//Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", clippedRead.getReadName()));
//Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", read.getReadName()));
//break;
myStop--; // just decrement stop
}
if ( start > 0 && myStop != clippedRead.getReadLength() - 1 )
throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d",
clippedRead.getReadName(), start, myStop));
if ( start > 0 && myStop != read.getReadLength() - 1 )
throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", read.getReadName(), start, myStop));
Cigar oldCigar = clippedRead.getCigar();
Cigar oldCigar = read.getCigar();
int scLeft = 0, scRight = clippedRead.getReadLength();
int scLeft = 0, scRight = read.getReadLength();
if ( start == 0 )
scLeft = myStop + 1;
else
scRight = start;
Cigar newCigar = softClip(oldCigar, scLeft, scRight);
clippedRead.setCigar(newCigar);
read.setCigar(newCigar);
int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar);
int newStart = clippedRead.getAlignmentStart() + newClippedStart;
clippedRead.setAlignmentStart(newStart);
if ( algorithm == ClippingRepresentation.HARDCLIP_BASES )
clippedRead = ReadUtils.hardClipSoftClippedBases(clippedRead);
//System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart);
int newStart = read.getAlignmentStart() + newClippedStart;
read.setAlignmentStart(newStart);
break;
@ -109,7 +112,7 @@ public class ClippingOp {
throw new IllegalStateException("Unexpected Clipping operator type " + algorithm);
}
return clippedRead;
return read;
}
/**
@ -247,4 +250,135 @@ public class ClippingOp {
assert newCigar.isValid(null, -1) == null;
return newCigar;
}
@Requires({"start <= stop", "start == 0 || stop == read.getReadLength() - 1", "!read.getReadUnmappedFlag()"})
private SAMRecord hardClip (SAMRecord read, int start, int stop) {
if (start == 0 && stop == read.getReadLength() -1)
return new SAMRecord(read.getHeader());
int newLength = read.getReadLength() - (stop - start + 1);
byte [] newBases = new byte[newLength];
byte [] newQuals = new byte[newLength];
int copyStart = (start == 0) ? stop + 1 : 0;
System.arraycopy(read.getReadBases(), copyStart, newBases, 0, newLength);
System.arraycopy(read.getBaseQualities(), copyStart, newQuals, 0, newLength); Cigar newCigar = hardClipCigar(read.getCigar(), start, stop);
SAMRecord hardClippedRead;
try {
hardClippedRead = (SAMRecord) read.clone();
} catch (CloneNotSupportedException e) {
throw new ReviewedStingException("Where did the clone go?");
}
hardClippedRead.setBaseQualities(newQuals);
hardClippedRead.setReadBases(newBases);
hardClippedRead.setCigar(newCigar);
if (start == 0)
hardClippedRead.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), newCigar));
return hardClippedRead;
}
@Requires({"!cigar.isEmpty()"})
private Cigar hardClipCigar (Cigar cigar, int start, int stop) {
Cigar newCigar = new Cigar();
int index = 0;
int totalHardClipCount = stop - start + 1;
// hard clip the beginning of the cigar string
if (start == 0) {
Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
CigarElement cigarElement = cigarElementIterator.next();
// Skip all leading hard clips
while (cigarElement.getOperator() == CigarOperator.HARD_CLIP) {
totalHardClipCount += cigarElement.getLength();
if (cigarElementIterator.hasNext())
cigarElement = cigarElementIterator.next();
else
throw new ReviewedStingException("Read is entirely hardclipped, shouldn't be trying to clip it's cigar string");
}
// keep clipping until we hit stop
while (index <= stop) {
int shift = 0;
if (cigarElement.getOperator().consumesReadBases())
shift = cigarElement.getLength();
// we're still clipping or just finished perfectly
if (index + shift == stop + 1) {
newCigar.add(new CigarElement(totalHardClipCount, CigarOperator.HARD_CLIP));
}
// element goes beyond what we need to clip
else if (index + shift > stop + 1) {
int elementLengthAfterChopping = cigarElement.getLength() - (stop - index + 1);
newCigar.add(new CigarElement(totalHardClipCount, CigarOperator.HARD_CLIP));
newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator()));
}
index += shift;
if (index <= stop && cigarElementIterator.hasNext())
cigarElement = cigarElementIterator.next();
}
// add the remaining cigar elements
while (cigarElementIterator.hasNext()) {
cigarElement = cigarElementIterator.next();
newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator()));
}
}
// hard clip the end of the cigar string
else {
Iterator<CigarElement> cigarElementIterator = cigar.getCigarElements().iterator();
CigarElement cigarElement = cigarElementIterator.next();
// Keep marching on until we find the start
while (index < start) {
int shift = 0;
if (cigarElement.getOperator().consumesReadBases())
shift = cigarElement.getLength();
// we haven't gotten to the start yet, keep everything as is.
if (index + shift < start)
newCigar.add(new CigarElement(cigarElement.getLength(), cigarElement.getOperator()));
// element goes beyond our clip starting position
else {
int elementLengthAfterChopping = start - index;
// if this last element is a HARD CLIP operator, just merge it with our hard clip operator to be added later
if (cigarElement.getOperator() == CigarOperator.HARD_CLIP)
totalHardClipCount += elementLengthAfterChopping;
// otherwise, maintain what's left of this last operator
else
newCigar.add(new CigarElement(elementLengthAfterChopping, cigarElement.getOperator()));
}
index += shift;
if (index < start && cigarElementIterator.hasNext())
cigarElement = cigarElementIterator.next();
}
newCigar.add(new CigarElement(totalHardClipCount, CigarOperator.HARD_CLIP));
}
return newCigar;
}
private int calculateAlignmentStartShift(Cigar oldCigar, Cigar newCigar) {
int shift = 0;
// Rewind to previous start (by counting everything that was already clipped in this read)
for (CigarElement cigarElement : oldCigar.getCigarElements()) {
if (!cigarElement.getOperator().consumesReferenceBases())
shift -= cigarElement.getLength();
else
break;
}
// Advance to new start (by counting everything new that has been clipped )
for (CigarElement cigarElement : newCigar.getCigarElements()) {
if (!cigarElement.getOperator().consumesReferenceBases())
shift += cigarElement.getLength();
else
break;
}
return shift;
}
}

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.utils.clipreads;
import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement;
import net.sf.samtools.SAMRecord;
import org.broad.tribble.util.PositionalStream;
import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.jets3t.service.multi.ThreadedStorageService;
@ -14,6 +15,7 @@ import java.util.List;
*/
public class ReadClipper {
SAMRecord read;
boolean wasClipped;
List<ClippingOp> ops = null;
/**
@ -23,6 +25,7 @@ public class ReadClipper {
*/
public ReadClipper(final SAMRecord read) {
this.read = read;
this.wasClipped = false;
}
/**
@ -40,7 +43,7 @@ public class ReadClipper {
}
public boolean wasClipped() {
return ops != null;
return wasClipped;
}
public SAMRecord getRead() {
@ -48,8 +51,10 @@ public class ReadClipper {
}
public SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) {
int start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart);
int stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop);
int start = (refStart < 0) ? 0 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart);
int stop = (refStop < 0) ? read.getReadLength() - 1 : ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop);
System.out.println("Clipping start/stop: " + start + "/" + stop);
this.addOp(new ClippingOp(start, stop));
return clipRead(ClippingRepresentation.HARDCLIP_BASES);
}
@ -60,9 +65,31 @@ public class ReadClipper {
}
public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) {
this.read = hardClipByReferenceCoordinates(read.getUnclippedStart(), left);
this.read = hardClipByReferenceCoordinates(-1, left);
this.ops = null; // reset the operations
return hardClipByReferenceCoordinates(right, read.getUnclippedEnd());
return hardClipByReferenceCoordinates(right, -1);
}
public SAMRecord hardClipLowQualEnds(byte lowQual) {
byte [] quals = read.getBaseQualities();
int leftClipIndex = 0;
int rightClipIndex = read.getReadLength() - 1;
// check how far we can clip both sides
while (rightClipIndex >= 0 && quals[rightClipIndex] <= lowQual) rightClipIndex--;
while (leftClipIndex < read.getReadLength() && quals[leftClipIndex] <= lowQual) leftClipIndex++;
// if the entire read should be clipped, then return an empty read. (--todo: maybe null is better? testing this for now)
if (leftClipIndex > rightClipIndex)
return (new SAMRecord(read.getHeader()));
if (rightClipIndex < read.getReadLength() - 1) {
this.addOp(new ClippingOp(rightClipIndex + 1, read.getReadLength() - 1));
}
if (leftClipIndex > 0 ) {
this.addOp(new ClippingOp(0, leftClipIndex - 1));
}
return this.clipRead(ClippingRepresentation.HARDCLIP_BASES);
}
/**
@ -80,6 +107,7 @@ public class ReadClipper {
for (ClippingOp op : getOps()) {
clippedRead = op.apply(algorithm, clippedRead);
}
wasClipped = true;
return clippedRead;
} catch (CloneNotSupportedException e) {
throw new RuntimeException(e); // this should never happen

View File

@ -148,7 +148,7 @@ public class ReadUtils {
* |----------------| (interval)
* <--------> (read)
*/
public enum ReadAndIntervalOverlap {NO_OVERLAP, LEFT_OVERLAP, RIGHT_OVERLAP, FULL_OVERLAP, CONTAINED}
public enum ReadAndIntervalOverlap {NO_OVERLAP_CONTIG, NO_OVERLAP_LEFT, NO_OVERLAP_RIGHT, OVERLAP_LEFT, OVERLAP_RIGHT, OVERLAP_LEFT_AND_RIGHT, OVERLAP_CONTAINED}
/**
* God, there's a huge information asymmetry in SAM format:
@ -630,41 +630,71 @@ public class ReadUtils {
* @return the overlap type as described by ReadAndIntervalOverlap enum (see above)
*/
public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(SAMRecord read, GenomeLoc interval) {
if ( (!read.getReferenceName().equals(interval.getContig())) ||
(read.getUnclippedEnd() < interval.getStart()) ||
(read.getUnclippedStart() > interval.getStop()) )
return ReadAndIntervalOverlap.NO_OVERLAP;
else if ( (read.getUnclippedStart() >= interval.getStart()) &&
(read.getUnclippedEnd() <= interval.getStop()) )
return ReadAndIntervalOverlap.CONTAINED;
int start = getRefCoordSoftUnclippedStart(read);
int stop = getRefCoordSoftUnclippedStop(read);
else if ( (read.getUnclippedStart() < interval.getStart()) &&
(read.getUnclippedEnd() > interval.getStop()) )
return ReadAndIntervalOverlap.FULL_OVERLAP;
if ( !read.getReferenceName().equals(interval.getContig()) )
return ReadAndIntervalOverlap.NO_OVERLAP_CONTIG;
else if ( (read.getAlignmentStart() < interval.getStart()) )
return ReadAndIntervalOverlap.LEFT_OVERLAP;
else if ( stop < interval.getStart() )
return ReadAndIntervalOverlap.NO_OVERLAP_LEFT;
else if ( start > interval.getStop() )
return ReadAndIntervalOverlap.NO_OVERLAP_RIGHT;
else if ( (start >= interval.getStart()) &&
(stop <= interval.getStop()) )
return ReadAndIntervalOverlap.OVERLAP_CONTAINED;
else if ( (start < interval.getStart()) &&
(stop > interval.getStop()) )
return ReadAndIntervalOverlap.OVERLAP_LEFT_AND_RIGHT;
else if ( (start < interval.getStart()) )
return ReadAndIntervalOverlap.OVERLAP_LEFT;
else
return ReadAndIntervalOverlap.RIGHT_OVERLAP;
return ReadAndIntervalOverlap.OVERLAP_RIGHT;
}
@Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"})
public static int getRefCoordSoftUnclippedStart(SAMRecord read) {
int start = read.getUnclippedStart();
for (CigarElement cigarElement : read.getCigar().getCigarElements()) {
if (cigarElement.getOperator() == CigarOperator.HARD_CLIP)
start += cigarElement.getLength();
else
break;
}
return start;
}
public static int getRefCoordSoftUnclippedStop(SAMRecord read) {
int stop = read.getAlignmentEnd();
List<CigarElement> cigarElementList = read.getCigar().getCigarElements();
CigarElement lastCigarElement = cigarElementList.get(cigarElementList.size()-1);
if (lastCigarElement.getOperator() == CigarOperator.SOFT_CLIP)
stop += lastCigarElement.getLength();
return stop;
}
@Requires({"refCoord >= read.getAlignmentStart()", "refCoord <= read.getAlignmentEnd()"})
@Ensures({"result >= 0", "result < read.getReadLength()"})
public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) {
int readBases = 0;
int refBases = 0;
int goal = refCoord - read.getUnclippedStart(); // read coords are 0-based!
boolean goalReached = false;
int goal = refCoord - read.getAlignmentStart(); // The goal is to move this many reference bases
boolean goalReached = refBases == goal;
Iterator<CigarElement> cigarElementIterator = read.getCigar().getCigarElements().iterator();
while (!goalReached && cigarElementIterator.hasNext()) {
CigarElement cigarElement = cigarElementIterator.next();
int shift = 0;
if (refBases == 0 && readBases == 0 && cigarElement.getOperator() == CigarOperator.HARD_CLIP) {
goal -= cigarElement.getLength();
}
//if (refBases == 0 && readBases == 0 && cigarElement.getOperator() == CigarOperator.HARD_CLIP) {
// goal -= cigarElement.getLength();
//}
if (cigarElement.getOperator().consumesReferenceBases()) {
if (refBases + cigarElement.getLength() < goal) {