diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index b9aaf47de..286e22369 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -333,10 +333,6 @@ public class RefMetaDataTracker { return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); } @Deprecated - public List getValues(final Class type, final Collection names, final GenomeLoc onlyAtThisLoc) { - return addValues(names, type, new ArrayList(), onlyAtThisLoc, true, false); - } - @Deprecated public T getFirstValue(final Class type, final String name) { return safeGetFirst(getValues(type, name)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index ca4e3f5e3..76b0276cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation; import org.broadinstitute.sting.utils.clipreads.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.yaml.snakeyaml.events.SequenceStartEvent; import java.io.File; import java.io.PrintStream; @@ -54,7 +55,7 @@ import java.util.regex.Pattern; * with poor quality scores, that match particular sequences, or that were generated by particular machine cycles. */ @Requires({DataSource.READS}) -public class ClipReadsWalker extends ReadWalker { +public class ClipReadsWalker extends ReadWalker { @Output PrintStream out; @@ -180,12 +181,12 @@ public class ClipReadsWalker extends ReadWalker p : cyclesToClip) { // iterate over each cycle range int cycleStart = p.first; @@ -270,10 +276,13 @@ public class ClipReadsWalker extends ReadWalker clipSeqs) { + super(read); + data = new ClippingData(clipSeqs); + } + + public ClippingData getData() { + return data; + } + + public void setData(ClippingData data) { + this.data = data; + } + + public void addData(ClippingData data) { + this.data.addData(data); + } + } + + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java index 60f9724e8..a57fabc39 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.Collections; import java.util.List; @@ -49,7 +50,7 @@ import java.util.List; public class FastaAlternateReferenceWalker extends FastaReferenceWalker { @Input(fullName = "variant", shortName = "V", doc="variants to model", required=false) - public List> variants; + public List> variants = Collections.emptyList(); @Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false) public RodBinding snpmask; @@ -66,17 +67,18 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker { String refBase = String.valueOf((char)ref.getBase()); // Check to see if we have a called snp - for ( VariantContext vc : tracker.getValues(VariantContext.class) ) { - if ( ! vc.getSource().equals(snpmask.getName())) { - if ( vc.isDeletion()) { - deletionBasesRemaining = vc.getReference().length(); - // delete the next n bases, not this one - return new Pair(context.getLocation(), refBase); - } else if ( vc.isInsertion()) { - return new Pair(context.getLocation(), refBase.concat(vc.getAlternateAllele(0).toString())); - } else if (vc.isSNP()) { - return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); - } + for ( VariantContext vc : tracker.getValues(variants) ) { + if ( vc.isFiltered() ) + continue; + + if ( vc.isDeletion()) { + deletionBasesRemaining = vc.getReference().length(); + // delete the next n bases, not this one + return new Pair(context.getLocation(), refBase); + } else if ( vc.isInsertion()) { + return new Pair(context.getLocation(), refBase.concat(vc.getAlternateAllele(0).toString())); + } else if (vc.isSNP()) { + return new Pair(context.getLocation(), vc.getAlternateAllele(0).toString()); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java index 640cff2ba..29b649afe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java @@ -2,16 +2,18 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import java.io.PrintStream; +import java.util.Collections; import java.util.List; /** @@ -23,6 +25,9 @@ public class CountIntervals extends RefWalker { @Output PrintStream out; + @Input(fullName="check", shortName = "check", doc="Any number of RODs", required=false) + public List> features = Collections.emptyList(); + @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false) int numOverlaps = 2; @@ -37,7 +42,7 @@ public class CountIntervals extends RefWalker { return null; } - List checkIntervals = tracker.getValues(Feature.class, "check"); + List checkIntervals = tracker.getValues(features); return (long) checkIntervals.size(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java index 5449906b2..7d351686f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java @@ -4,6 +4,8 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.Vector; @@ -16,22 +18,14 @@ import java.util.Vector; * according to the wishes of the supplid ClippingAlgorithm enum. */ public class ClippingOp { - public final ClippingType type; public final int start, stop; // inclusive - public final Object extraInfo; public ClippingOp(int start, int stop) { - this(null, start, stop, null); - } - - public ClippingOp(ClippingType type, int start, int stop, Object extraInfo) { - // todo -- remove type and extra info - this.type = type; this.start = start; this.stop = stop; - this.extraInfo = extraInfo; } + public int getLength() { return stop - start + 1; } @@ -72,48 +66,45 @@ public class ClippingOp { break; case HARDCLIP_BASES: case SOFTCLIP_BASES: - if ( ! clippedRead.getReadUnmappedFlag() ) { + if ( clippedRead.getReadUnmappedFlag() ) { // we can't process unmapped reads - - //System.out.printf("%d %d %d%n", stop, start, clippedRead.getReadLength()); - int myStop = stop; - if ( (stop + 1 - start) == clippedRead.getReadLength() ) { - // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone - //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", clippedRead.getReadName())); - //break; - myStop--; // just decrement stop - } - - if ( start > 0 && myStop != clippedRead.getReadLength() - 1 ) - throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", - clippedRead.getReadName(), start, myStop)); - - Cigar oldCigar = clippedRead.getCigar(); - - int scLeft = 0, scRight = clippedRead.getReadLength(); - if ( start == 0 ) - scLeft = myStop + 1; - else - scRight = start; - - Cigar newCigar = softClip(oldCigar, scLeft, scRight); - clippedRead.setCigar(newCigar); - - int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); - int newStart = clippedRead.getAlignmentStart() + newClippedStart; - clippedRead.setAlignmentStart(newStart); - - if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) - clippedRead = ReadUtils.hardClipSoftClippedBases(clippedRead); - //System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart); - } else if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) { - // we can hard clip unmapped reads - if ( clippedRead.getReadNegativeStrandFlag() ) - clippedRead = ReadUtils.hardClipBases(clippedRead, 0, start, null); - else - clippedRead = ReadUtils.hardClipBases(clippedRead, start, start + getLength(), null); + throw new UserException("Read Clipper cannot soft/hard clip unmapped reads"); } + + //System.out.printf("%d %d %d%n", stop, start, clippedRead.getReadLength()); + int myStop = stop; + if ( (stop + 1 - start) == clippedRead.getReadLength() ) { + // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone + //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", clippedRead.getReadName())); + //break; + myStop--; // just decrement stop + } + + if ( start > 0 && myStop != clippedRead.getReadLength() - 1 ) + throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", + clippedRead.getReadName(), start, myStop)); + + Cigar oldCigar = clippedRead.getCigar(); + + int scLeft = 0, scRight = clippedRead.getReadLength(); + if ( start == 0 ) + scLeft = myStop + 1; + else + scRight = start; + + Cigar newCigar = softClip(oldCigar, scLeft, scRight); + clippedRead.setCigar(newCigar); + + int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); + int newStart = clippedRead.getAlignmentStart() + newClippedStart; + clippedRead.setAlignmentStart(newStart); + + if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) + clippedRead = ReadUtils.hardClipSoftClippedBases(clippedRead); + //System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart); + break; + default: throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); } @@ -121,15 +112,6 @@ public class ClippingOp { return clippedRead; } - /** - * What is the type of a ClippingOp? - */ - public enum ClippingType { - LOW_Q_SCORES, - WITHIN_CLIP_RANGE, - MATCHES_CLIP_SEQ - } - /** * Given a cigar string, get the number of bases hard or soft clipped at the start */ @@ -198,7 +180,7 @@ public class ClippingOp { Vector newElements = new Vector(); for (CigarElement curElem : __cigar.getCigarElements()) { if (!curElem.getOperator().consumesReadBases()) { - if (curLength > __startClipEnd && curLength < __endClipBegin) { + if (curElem.getOperator() == CigarOperator.HARD_CLIP || curLength > __startClipEnd && curLength < __endClipBegin) { newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); } continue; diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 988d297f6..1307f1b6f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -1,6 +1,10 @@ package org.broadinstitute.sting.utils.clipreads; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.jets3t.service.multi.ThreadedStorageService; import java.util.ArrayList; import java.util.List; @@ -43,6 +47,23 @@ public class ReadClipper { return read; } + public SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { + int start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart); + int stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop); + this.addOp(new ClippingOp(start, stop)); + return clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + + public SAMRecord hardClipByReadCoordinates(int start, int stop) { + this.addOp(new ClippingOp(start, stop)); + return clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + + public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { + this.read = hardClipByReferenceCoordinates(read.getUnclippedStart(), left); + this.ops = null; // reset the operations + return hardClipByReferenceCoordinates(right, read.getUnclippedEnd()); + } /** * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 6c15910b1..3d803804a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.sam; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.*; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -112,7 +113,42 @@ public class ReadUtils { * @version 0.1 */ - public enum OverlapType { NOT_OVERLAPPING, IN_ADAPTOR } + public enum OverlapType { NOT_OVERLAPPING, IN_ADAPTOR} + + /** + * This enum represents all the different ways in which a read can overlap an interval. + * + * NO_OVERLAP: + * the read does not overlap the interval. + * + * |----------------| (interval) + * <----------------> (read) + * + * LEFT_OVERLAP: + * the read starts before the beginning of the interval but ends inside of it + * + * |----------------| (interval) + * <----------------> (read) + * + * RIGHT_OVERLAP: + * the read starts inside the interval but ends outside of it + * + * |----------------| (interval) + * <----------------> (read) + * + * FULL_OVERLAP: + * the read starts before the interval and ends after the interval + * + * |-----------| (interval) + * <-------------------> (read) + * + * CONTAINED: + * the read starts and ends inside the interval + * + * |----------------| (interval) + * <--------> (read) + */ + public enum ReadAndIntervalOverlap {NO_OVERLAP, LEFT_OVERLAP, RIGHT_OVERLAP, FULL_OVERLAP, CONTAINED} /** * God, there's a huge information asymmetry in SAM format: @@ -396,16 +432,34 @@ public class ReadUtils { keepEnd = rec.getReadLength() - l - 1; newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP)); break; - case H: - // TODO -- must be handled specially - throw new ReviewedStingException("BUG: tell mark he forgot to implement this"); + default: newCigarElements.add(ce); break; } } - return hardClipBases(rec, keepStart, keepEnd, newCigarElements); + // Merges tandem cigar elements like 5H10H or 2S5S to 15H or 7S + // this will happen if you soft clip a read that has been hard clipped before + // like: 5H20S => 5H20H + List mergedCigarElements = new LinkedList(); + Iterator cigarElementIterator = newCigarElements.iterator(); + CigarOperator currentOperator = null; + int currentOperatorLength = 0; + while (cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + if (currentOperator != cigarElement.getOperator()) { + if (currentOperator != null) + mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); + currentOperator = cigarElement.getOperator(); + currentOperatorLength = cigarElement.getLength(); + } + else + currentOperatorLength += cigarElement.getLength(); + } + mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); + + return hardClipBases(rec, keepStart, keepEnd, mergedCigarElements); } /** @@ -424,8 +478,7 @@ public class ReadUtils { "keepEnd < rec.getReadLength()", "rec.getReadUnmappedFlag() || newCigarElements != null"}) @Ensures("result != null") - public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd, - List newCigarElements) { + public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd, List newCigarElements) { int newLength = keepEnd - keepStart + 1; if ( newLength != rec.getReadLength() ) { try { @@ -569,7 +622,71 @@ public class ReadUtils { return 0; } + /** + * Determines what is the position of the read in relation to the interval. + * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. + * @param read the read + * @param interval the interval + * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) + */ + public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(SAMRecord read, GenomeLoc interval) { + if ( (!read.getReferenceName().equals(interval.getContig())) || + (read.getUnclippedEnd() < interval.getStart()) || + (read.getUnclippedStart() > interval.getStop()) ) + return ReadAndIntervalOverlap.NO_OVERLAP; + else if ( (read.getUnclippedStart() >= interval.getStart()) && + (read.getUnclippedEnd() <= interval.getStop()) ) + return ReadAndIntervalOverlap.CONTAINED; + + else if ( (read.getUnclippedStart() < interval.getStart()) && + (read.getUnclippedEnd() > interval.getStop()) ) + return ReadAndIntervalOverlap.FULL_OVERLAP; + + else if ( (read.getAlignmentStart() < interval.getStart()) ) + return ReadAndIntervalOverlap.LEFT_OVERLAP; + + else + return ReadAndIntervalOverlap.RIGHT_OVERLAP; + } + + @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) + @Ensures({"result >= 0", "result < read.getReadLength()"}) + public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) { + int readBases = 0; + int refBases = 0; + int goal = refCoord - read.getUnclippedStart(); // read coords are 0-based! + boolean goalReached = false; + + Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); + while (!goalReached && cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + int shift = 0; + if (refBases == 0 && readBases == 0 && cigarElement.getOperator() == CigarOperator.HARD_CLIP) { + goal -= cigarElement.getLength(); + } + + if (cigarElement.getOperator().consumesReferenceBases()) { + if (refBases + cigarElement.getLength() < goal) { + shift = cigarElement.getLength(); + } + else { + shift = goal - refBases; + } + refBases += shift; + } + goalReached = refBases == goal; + + if (cigarElement.getOperator().consumesReadBases()) { + readBases += goalReached ? shift : cigarElement.getLength(); + } + } + + if (!goalReached) + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + + return readBases; + } }