From 0be1dacddb6b699316aa3a3280f9166cd1685da4 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 13 Aug 2011 19:33:53 -0400 Subject: [PATCH 04/17] Refactored interval clipping utility reads are clipped in map() and now we cover almost all cases. Left behind the case where the read stretches through two intervals. This will need special treatment later. --- .../sting/utils/sam/ReadUtils.java | 66 ++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 6c15910b1..448fc828f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.utils.sam; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.*; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -112,7 +113,42 @@ public class ReadUtils { * @version 0.1 */ - public enum OverlapType { NOT_OVERLAPPING, IN_ADAPTOR } + public enum OverlapType { NOT_OVERLAPPING, IN_ADAPTOR} + + /** + * This enum represents all the different ways in which a read can overlap an interval. + * + * NO_OVERLAP: + * the read does not overlap the interval. + * + * |----------------| (interval) + * <----------------> (read) + * + * LEFT_OVERLAP: + * the read starts before the beginning of the interval but ends inside of it + * + * |----------------| (interval) + * <----------------> (read) + * + * RIGHT_OVERLAP: + * the read starts inside the interval but ends outside of it + * + * |----------------| (interval) + * <----------------> (read) + * + * FULL_OVERLAP: + * the read starts before the interval and ends after the interval + * + * |-----------| (interval) + * <-------------------> (read) + * + * CONTAINED: + * the read starts and ends inside the interval + * + * |----------------| (interval) + * <--------> (read) + */ + public enum ReadAndIntervalOverlap {NO_OVERLAP, LEFT_OVERLAP, RIGHT_OVERLAP, FULL_OVERLAP, CONTAINED} /** * God, there's a huge information asymmetry in SAM format: @@ -569,6 +605,34 @@ public class ReadUtils { return 0; } + /** + * Determines what is the position of the read in relation to the interval. + * Note: This function uses the UNCLIPPED ENDS of the reads for the comparison. + * @param read the read + * @param interval the interval + * @return the overlap type as described by ReadAndIntervalOverlap enum (see above) + */ + public static ReadAndIntervalOverlap getReadAndIntervalOverlapType(SAMRecord read, GenomeLoc interval) { + if ( (!read.getReferenceName().equals(interval.getContig())) || + (read.getUnclippedEnd() < interval.getStart()) || + (read.getUnclippedStart() > interval.getStop()) ) + return ReadAndIntervalOverlap.NO_OVERLAP; + + else if ( (read.getUnclippedStart() > interval.getStart()) && + (read.getUnclippedEnd() < interval.getStop()) ) + return ReadAndIntervalOverlap.CONTAINED; + + else if ( (read.getUnclippedStart() < interval.getStart()) && + (read.getUnclippedEnd() > interval.getStop()) ) + return ReadAndIntervalOverlap.FULL_OVERLAP; + + else if ( (read.getAlignmentStart() < interval.getStart()) ) + return ReadAndIntervalOverlap.LEFT_OVERLAP; + + else + return ReadAndIntervalOverlap.RIGHT_OVERLAP; + } + From 291d8c7596150bd4e91a94eeef8ed35e758286a1 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 13 Aug 2011 21:02:24 -0400 Subject: [PATCH 05/17] Fixed HardClipping and Interval containment * Hard clipping was wrongfully hard clipping unmapped reads while soft clipping then hard clipping mapped reads. Now we throw exception if we try to hard/soft clip unmapped reads and use the soft->hard clip procedure fore every mapped read. * Interval containment needed a <= and >= to make sure it caught the borders right. --- .../sting/utils/clipreads/ClippingOp.java | 76 +++++++++---------- .../sting/utils/sam/ReadUtils.java | 4 +- 2 files changed, 39 insertions(+), 41 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java index 5449906b2..5789b606a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java @@ -4,6 +4,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.util.Vector; @@ -72,48 +73,45 @@ public class ClippingOp { break; case HARDCLIP_BASES: case SOFTCLIP_BASES: - if ( ! clippedRead.getReadUnmappedFlag() ) { + if ( clippedRead.getReadUnmappedFlag() ) { // we can't process unmapped reads - - //System.out.printf("%d %d %d%n", stop, start, clippedRead.getReadLength()); - int myStop = stop; - if ( (stop + 1 - start) == clippedRead.getReadLength() ) { - // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone - //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", clippedRead.getReadName())); - //break; - myStop--; // just decrement stop - } - - if ( start > 0 && myStop != clippedRead.getReadLength() - 1 ) - throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", - clippedRead.getReadName(), start, myStop)); - - Cigar oldCigar = clippedRead.getCigar(); - - int scLeft = 0, scRight = clippedRead.getReadLength(); - if ( start == 0 ) - scLeft = myStop + 1; - else - scRight = start; - - Cigar newCigar = softClip(oldCigar, scLeft, scRight); - clippedRead.setCigar(newCigar); - - int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); - int newStart = clippedRead.getAlignmentStart() + newClippedStart; - clippedRead.setAlignmentStart(newStart); - - if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) - clippedRead = ReadUtils.hardClipSoftClippedBases(clippedRead); - //System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart); - } else if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) { - // we can hard clip unmapped reads - if ( clippedRead.getReadNegativeStrandFlag() ) - clippedRead = ReadUtils.hardClipBases(clippedRead, 0, start, null); - else - clippedRead = ReadUtils.hardClipBases(clippedRead, start, start + getLength(), null); + throw new UserException("Read Clipper cannot soft/hard clip unmapped reads"); } + + //System.out.printf("%d %d %d%n", stop, start, clippedRead.getReadLength()); + int myStop = stop; + if ( (stop + 1 - start) == clippedRead.getReadLength() ) { + // BAM representation issue -- we can't SOFTCLIP away all bases in a read, just leave it alone + //Walker.logger.info(String.format("Warning, read %s has all bases clip but this can't be represented with SOFTCLIP_BASES, just leaving it alone", clippedRead.getReadName())); + //break; + myStop--; // just decrement stop + } + + if ( start > 0 && myStop != clippedRead.getReadLength() - 1 ) + throw new RuntimeException(String.format("Cannot apply soft clipping operator to the middle of a read: %s to be clipped at %d-%d", + clippedRead.getReadName(), start, myStop)); + + Cigar oldCigar = clippedRead.getCigar(); + + int scLeft = 0, scRight = clippedRead.getReadLength(); + if ( start == 0 ) + scLeft = myStop + 1; + else + scRight = start; + + Cigar newCigar = softClip(oldCigar, scLeft, scRight); + clippedRead.setCigar(newCigar); + + int newClippedStart = getNewAlignmentStartOffset(newCigar, oldCigar); + int newStart = clippedRead.getAlignmentStart() + newClippedStart; + clippedRead.setAlignmentStart(newStart); + + if ( algorithm == ClippingRepresentation.HARDCLIP_BASES ) + clippedRead = ReadUtils.hardClipSoftClippedBases(clippedRead); + //System.out.printf("%s clipping at %d %d / %d %d => %s and %d%n", oldCigar.toString(), start, stop, scLeft, scRight, newCigar.toString(), newStart); + break; + default: throw new IllegalStateException("Unexpected Clipping operator type " + algorithm); } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 448fc828f..16a02abdc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -618,8 +618,8 @@ public class ReadUtils { (read.getUnclippedStart() > interval.getStop()) ) return ReadAndIntervalOverlap.NO_OVERLAP; - else if ( (read.getUnclippedStart() > interval.getStart()) && - (read.getUnclippedEnd() < interval.getStop()) ) + else if ( (read.getUnclippedStart() >= interval.getStart()) && + (read.getUnclippedEnd() <= interval.getStop()) ) return ReadAndIntervalOverlap.CONTAINED; else if ( (read.getUnclippedStart() < interval.getStart()) && From 8a5173204983d86569486fe987b16480944b8f87 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sun, 14 Aug 2011 14:34:29 -0400 Subject: [PATCH 06/17] Fixes to ReadClipper and added Reference Coordinate clipping. * Added reference coordinate based hard clipping functions. This allows you to set a hard cut on where you need the read to be trimmed despite indels. * soft clipping was messing up cigar string if there was already a hard clip at the beginning of the read. Fixed. * hard clipping now works with previously hard clipped reads. --- .../sting/utils/clipreads/ClippingOp.java | 22 +------ .../sting/utils/clipreads/ReadClipper.java | 21 ++++++ .../sting/utils/sam/ReadUtils.java | 65 +++++++++++++++++-- 3 files changed, 83 insertions(+), 25 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java index 5789b606a..7d351686f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ClippingOp.java @@ -4,6 +4,7 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ReadUtils; @@ -17,22 +18,14 @@ import java.util.Vector; * according to the wishes of the supplid ClippingAlgorithm enum. */ public class ClippingOp { - public final ClippingType type; public final int start, stop; // inclusive - public final Object extraInfo; public ClippingOp(int start, int stop) { - this(null, start, stop, null); - } - - public ClippingOp(ClippingType type, int start, int stop, Object extraInfo) { - // todo -- remove type and extra info - this.type = type; this.start = start; this.stop = stop; - this.extraInfo = extraInfo; } + public int getLength() { return stop - start + 1; } @@ -119,15 +112,6 @@ public class ClippingOp { return clippedRead; } - /** - * What is the type of a ClippingOp? - */ - public enum ClippingType { - LOW_Q_SCORES, - WITHIN_CLIP_RANGE, - MATCHES_CLIP_SEQ - } - /** * Given a cigar string, get the number of bases hard or soft clipped at the start */ @@ -196,7 +180,7 @@ public class ClippingOp { Vector newElements = new Vector(); for (CigarElement curElem : __cigar.getCigarElements()) { if (!curElem.getOperator().consumesReadBases()) { - if (curLength > __startClipEnd && curLength < __endClipBegin) { + if (curElem.getOperator() == CigarOperator.HARD_CLIP || curLength > __startClipEnd && curLength < __endClipBegin) { newElements.add(new CigarElement(curElem.getLength(), curElem.getOperator())); } continue; diff --git a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java index 988d297f6..1307f1b6f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipreads/ReadClipper.java @@ -1,6 +1,10 @@ package org.broadinstitute.sting.utils.clipreads; +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.jets3t.service.multi.ThreadedStorageService; import java.util.ArrayList; import java.util.List; @@ -43,6 +47,23 @@ public class ReadClipper { return read; } + public SAMRecord hardClipByReferenceCoordinates(int refStart, int refStop) { + int start = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStart); + int stop = ReadUtils.getReadCoordinateForReferenceCoordinate(read, refStop); + this.addOp(new ClippingOp(start, stop)); + return clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + + public SAMRecord hardClipByReadCoordinates(int start, int stop) { + this.addOp(new ClippingOp(start, stop)); + return clipRead(ClippingRepresentation.HARDCLIP_BASES); + } + + public SAMRecord hardClipBothEndsByReferenceCoordinates(int left, int right) { + this.read = hardClipByReferenceCoordinates(read.getUnclippedStart(), left); + this.ops = null; // reset the operations + return hardClipByReferenceCoordinates(right, read.getUnclippedEnd()); + } /** * Return a new read corresponding to this.read that's been clipped according to ops, if any are present. diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 16a02abdc..3d803804a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -432,16 +432,34 @@ public class ReadUtils { keepEnd = rec.getReadLength() - l - 1; newCigarElements.add(new CigarElement(l, CigarOperator.HARD_CLIP)); break; - case H: - // TODO -- must be handled specially - throw new ReviewedStingException("BUG: tell mark he forgot to implement this"); + default: newCigarElements.add(ce); break; } } - return hardClipBases(rec, keepStart, keepEnd, newCigarElements); + // Merges tandem cigar elements like 5H10H or 2S5S to 15H or 7S + // this will happen if you soft clip a read that has been hard clipped before + // like: 5H20S => 5H20H + List mergedCigarElements = new LinkedList(); + Iterator cigarElementIterator = newCigarElements.iterator(); + CigarOperator currentOperator = null; + int currentOperatorLength = 0; + while (cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + if (currentOperator != cigarElement.getOperator()) { + if (currentOperator != null) + mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); + currentOperator = cigarElement.getOperator(); + currentOperatorLength = cigarElement.getLength(); + } + else + currentOperatorLength += cigarElement.getLength(); + } + mergedCigarElements.add(new CigarElement(currentOperatorLength, currentOperator)); + + return hardClipBases(rec, keepStart, keepEnd, mergedCigarElements); } /** @@ -460,8 +478,7 @@ public class ReadUtils { "keepEnd < rec.getReadLength()", "rec.getReadUnmappedFlag() || newCigarElements != null"}) @Ensures("result != null") - public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd, - List newCigarElements) { + public static SAMRecord hardClipBases(SAMRecord rec, int keepStart, int keepEnd, List newCigarElements) { int newLength = keepEnd - keepStart + 1; if ( newLength != rec.getReadLength() ) { try { @@ -633,7 +650,43 @@ public class ReadUtils { return ReadAndIntervalOverlap.RIGHT_OVERLAP; } + @Requires({"refCoord >= read.getUnclippedStart()", "refCoord <= read.getUnclippedEnd()"}) + @Ensures({"result >= 0", "result < read.getReadLength()"}) + public static int getReadCoordinateForReferenceCoordinate(SAMRecord read, int refCoord) { + int readBases = 0; + int refBases = 0; + int goal = refCoord - read.getUnclippedStart(); // read coords are 0-based! + boolean goalReached = false; + Iterator cigarElementIterator = read.getCigar().getCigarElements().iterator(); + while (!goalReached && cigarElementIterator.hasNext()) { + CigarElement cigarElement = cigarElementIterator.next(); + int shift = 0; + if (refBases == 0 && readBases == 0 && cigarElement.getOperator() == CigarOperator.HARD_CLIP) { + goal -= cigarElement.getLength(); + } + + if (cigarElement.getOperator().consumesReferenceBases()) { + if (refBases + cigarElement.getLength() < goal) { + shift = cigarElement.getLength(); + } + else { + shift = goal - refBases; + } + refBases += shift; + } + goalReached = refBases == goal; + + if (cigarElement.getOperator().consumesReadBases()) { + readBases += goalReached ? shift : cigarElement.getLength(); + } + } + + if (!goalReached) + throw new ReviewedStingException("Somehow the requested coordinate is not covered by the read. Too many deletions?"); + + return readBases; + } } From 6ae3f9e322772c939f7c494fe232adf465bd5f77 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sun, 14 Aug 2011 15:44:48 -0400 Subject: [PATCH 07/17] Wrapped clipping op information The clipping op extra information being kept by this walker was specific to the walker, not to the read clipper. Created a wrapper ReadClipperWithData class that keeps the extra information and leaves the ReadClipper slim. (this is a quick commit to unbreak the build, performing integration tests and will make further commits if necessary) --- .../sting/gatk/walkers/ClipReadsWalker.java | 76 ++++++++++++------- 1 file changed, 50 insertions(+), 26 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index ca4e3f5e3..fc9c3a01e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -54,7 +54,7 @@ import java.util.regex.Pattern; * with poor quality scores, that match particular sequences, or that were generated by particular machine cycles. */ @Requires({DataSource.READS}) -public class ClipReadsWalker extends ReadWalker { +public class ClipReadsWalker extends ReadWalker { @Output PrintStream out; @@ -99,6 +99,22 @@ public class ClipReadsWalker extends ReadWalker> cyclesToClip = null; + public class ReadClipperWithData extends ReadClipper { + private ClippingData data; + + public ReadClipperWithData(SAMRecord read) { + super(read); + } + + public ClippingData getData() { + return data; + } + + public void setData(ClippingData data) { + this.data = data; + } + } + /** * The initialize function. */ @@ -180,12 +196,12 @@ public class ClipReadsWalker extends ReadWalker Date: Sun, 14 Aug 2011 16:38:20 -0400 Subject: [PATCH 08/17] Fixed integration tests --- .../sting/gatk/walkers/ClipReadsWalker.java | 63 ++++++++++++------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java index fc9c3a01e..76b0276cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ClipReadsWalker.java @@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.clipreads.ClippingRepresentation; import org.broadinstitute.sting.utils.clipreads.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.sam.ReadUtils; +import org.yaml.snakeyaml.events.SequenceStartEvent; import java.io.File; import java.io.PrintStream; @@ -99,22 +100,6 @@ public class ClipReadsWalker extends ReadWalker> cyclesToClip = null; - public class ReadClipperWithData extends ReadClipper { - private ClippingData data; - - public ReadClipperWithData(SAMRecord read) { - super(read); - } - - public ClippingData getData() { - return data; - } - - public void setData(ClippingData data) { - this.data = data; - } - } - /** * The initialize function. */ @@ -201,7 +186,7 @@ public class ClipReadsWalker extends ReadWalker p : cyclesToClip) { // iterate over each cycle range int cycleStart = p.first; @@ -293,11 +279,10 @@ public class ClipReadsWalker extends ReadWalker clipSeqs) { + super(read); + data = new ClippingData(clipSeqs); + } + + public ClippingData getData() { + return data; + } + + public void setData(ClippingData data) { + this.data = data; + } + + public void addData(ClippingData data) { + this.data.addData(data); + } + } + + } \ No newline at end of file From 9ddbfdcb9fbd0c519ed801c6d759d48fab447301 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 15 Aug 2011 12:25:23 -0400 Subject: [PATCH 10/17] Check filtered status before applying to alt reference --- .../sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java index efc101618..17426d4c1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java @@ -61,6 +61,8 @@ public class FastaAlternateReferenceWalker extends FastaReferenceWalker { // Check to see if we have a called snp for ( VariantContext vc : vcs ) { + if ( vc.isFiltered() ) + continue; if ( !vc.getSource().startsWith("snpmask") ) { if ( vc.isDeletion()) { deletionBasesRemaining = vc.getReference().length(); From fc2c21433b6af1c98ab89c4ce18ba9450bd43d2c Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 15 Aug 2011 13:29:31 -0400 Subject: [PATCH 11/17] Updating random walkers to new rod system --- .../sting/gatk/walkers/qc/CountIntervals.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java index 640cff2ba..29b649afe 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountIntervals.java @@ -2,16 +2,18 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import java.io.PrintStream; +import java.util.Collections; import java.util.List; /** @@ -23,6 +25,9 @@ public class CountIntervals extends RefWalker { @Output PrintStream out; + @Input(fullName="check", shortName = "check", doc="Any number of RODs", required=false) + public List> features = Collections.emptyList(); + @Argument(fullName="numOverlaps",shortName="no",doc="Count all occurrences of X or more overlapping intervals; defaults to 2", required=false) int numOverlaps = 2; @@ -37,7 +42,7 @@ public class CountIntervals extends RefWalker { return null; } - List checkIntervals = tracker.getValues(Feature.class, "check"); + List checkIntervals = tracker.getValues(features); return (long) checkIntervals.size(); } From 045e8a045eb7aa750c885e80d34a26fa2c634a5a Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 15 Aug 2011 14:05:23 -0400 Subject: [PATCH 12/17] Updating random walkers to new rod system; removing unused GenotypeAndValidateWalker --- .../broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index b9aaf47de..286e22369 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -333,10 +333,6 @@ public class RefMetaDataTracker { return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); } @Deprecated - public List getValues(final Class type, final Collection names, final GenomeLoc onlyAtThisLoc) { - return addValues(names, type, new ArrayList(), onlyAtThisLoc, true, false); - } - @Deprecated public T getFirstValue(final Class type, final String name) { return safeGetFirst(getValues(type, name)); } From 1246b8904908d3e05329032be7263d0408ab76d9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Mon, 15 Aug 2011 16:00:43 -0400 Subject: [PATCH 13/17] Forgot to initialize variants on the merge --- .../gatk/walkers/fasta/FastaAlternateReferenceWalker.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java index 365d3c2c9..a57fabc39 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceWalker.java @@ -35,6 +35,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.util.Collections; import java.util.List; @@ -49,7 +50,7 @@ import java.util.List; public class FastaAlternateReferenceWalker extends FastaReferenceWalker { @Input(fullName = "variant", shortName = "V", doc="variants to model", required=false) - public List> variants; + public List> variants = Collections.emptyList(); @Input(fullName="snpmask", shortName = "snpmask", doc="SNP mask VCF file", required=false) public RodBinding snpmask; From cf3e826a6978fcd50dfd3410a22ebf304e511e2f Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Fri, 29 Jul 2011 15:53:56 -0400 Subject: [PATCH 14/17] 1) RFCombine switched to the new ROD system 2) TreeReduce added to useful RODWalkers, but doesn't help very much due to scaling problems 3) RFA refactored, and a genotype-free calculation model added to calculate skew in a genotype-free way (still needs generalization to any ploidy) 4) Added walker to genotype intron loss events, calls into the UG engine to do so. This is very much a first-pass walker. 5) Documentation added for ValidationAmplicons --- .../filters/VariantFiltrationWalker.java | 9 +- .../genotyper/ExactAFCalculationModel.java | 69 ++++++++- .../validation/ValidationAmplicons.java | 72 ++++++++-- .../walkers/variantutils/CombineVariants.java | 6 +- .../broadinstitute/sting/utils/MathUtils.java | 135 +++++++++++++++++- 5 files changed, 279 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index c555e88cd..e34fa772b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -28,6 +28,9 @@ package org.broadinstitute.sting.gatk.walkers.filters; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -47,7 +50,7 @@ import java.util.*; * Filters variant calls using a number of user-selectable, parameterizable criteria. */ @Reference(window=@Window(start=-50,stop=50)) -public class VariantFiltrationWalker extends RodWalker { +public class VariantFiltrationWalker extends RodWalker implements TreeReducible { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -278,6 +281,10 @@ public class VariantFiltrationWalker extends RodWalker { return sum + value; } + public Integer treeReduce(Integer left, Integer right) { + return left + right; + } + /** * Tell the user the number of loci processed and close out the new variants file. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index cd006a3cf..fa4863330 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import sun.reflect.generics.reflectiveObjects.NotImplementedException; @@ -580,7 +581,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // TODO -- remove me for clarity in this code // // ------------------------------------------------------------------------------------- - public int gdaN2GoldStandard(Map GLs, + public static int gdaN2GoldStandard(Map GLs, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { int numSamples = GLs.size(); @@ -658,4 +659,70 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } + // todo -- generalize and merge into gdaN2GoldStandard + public static int rfaN2GoldStandard(Map GLs, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { + int numSamples = GLs.size(); + int numChr = 2*numSamples; + + double[][] logYMatrix = new double[1+numSamples][1+numChr]; + + for (int i=0; i <=numSamples; i++) + for (int j=0; j <=numChr; j++) + logYMatrix[i][j] = Double.NEGATIVE_INFINITY; + + //YMatrix[0][0] = 1.0; + logYMatrix[0][0] = 0.0; + int j=0; + + for ( Map.Entry sample : GLs.entrySet() ) { + j++; + + //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); + double[] genotypeLikelihoods = sample.getValue().getAsVector(); + //double logDenominator = Math.log10(2.0*j*(2.0*j-1)); + double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + + // special treatment for k=0: iteration reduces to: + //YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()]; + logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA]; + + for (int k=1; k <= 2*j; k++ ) { + + //double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()]; + double logNumerator[]; + logNumerator = new double[3]; + if (k < 2*j-1) + logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] + + genotypeLikelihoods[idxAA]; + else + logNumerator[0] = Double.NEGATIVE_INFINITY; + + + if (k < 2*j) + logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + + genotypeLikelihoods[idxAB]; + else + logNumerator[1] = Double.NEGATIVE_INFINITY; + + if (k > 1) + logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] + + genotypeLikelihoods[idxBB]; + else + logNumerator[2] = Double.NEGATIVE_INFINITY; + + double logNum = MathUtils.softMax(logNumerator); + + //YMatrix[j][k] = num/den; + logYMatrix[j][k] = logNum - logDenominator; + } + + } + + for (int k=0; k <= numChr; k++) + log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; + + return numChr; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 7653f511f..82c5fc593 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -31,21 +31,77 @@ import java.util.LinkedList; import java.util.List; /** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 6/13/11 - * Time: 2:12 PM - * To change this template use File | Settings | File Templates. + * Creates FASTA sequences for use in Seqenom or PCR utilities for site amplification and subsequent validation + * + *

+ * ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe + * sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within + * the output amplicon, lower-casing non-specific tracts, allows for users to provide sites to mask out, and specifies + * reasons why the site may fail validation (nearby variation, for example). + *

+ * + *

Input

+ *

+ * Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an + * interval list defining the size of the amplicons around the sites to be validated + *

+ * + *

Output

+ *

+ * Output is a FASTA-formatted file with some modifications at probe sites. For instance: + * + * >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414 + * CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC + * >20:792122 Valid 20_792122 + * TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT + * >20:994145 Valid 20_994145 + * TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC + * >20:1074230 SITE_IS_FILTERED=1, 20_1074230 + * ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC + * >20:1084330 DELETION=1, 20_1084330 + * CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC + * + * are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be: + * + * Valid // amplicon is valid + * SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant") + * VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak + * MULTIPLE_PROBES=1, // multiple variants to be validated found inside the same amplicon + * DELETION=6,INSERTION=5, // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate + * DELETION=1, // deletion found inside the amplicon region, could shift mass-spec peak + * START_TOO_CLOSE, // variant is too close to the start of the amplicon region to give sequenom a good chance to find a suitable primer + * END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer + * NO_VARIANTS_FOUND, // no variants found within the amplicon region + * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself) + *

+ * + *

Examples

+ * PRE-TAG + * java + * -jar GenomeAnalysisTK.jar + * -T ValidationAmplicons + * -R /humgen/1kg/reference/human_g1k_v37.fasta + * -BTI ProbeIntervals + * -ProbeIntervals:table interval_table.table + * -ValidateAlleles:vcf sites_to_validate.vcf + * -MaskAlleles:vcf mask_sites.vcf + * --virtualPrimerSize 30 + * -o probes.fasta + * PRE-TAG + * + * @author chartl + * @since July 2011 */ @Requires(value={DataSource.REFERENCE}) public class ValidationAmplicons extends RodWalker { - @Input(fullName = "ProbeIntervals", doc="Chris document me", required=true) + @Input(fullName = "ProbeIntervals", doc="A collection of intervals in table format with optional names that represent the "+ + "intervals surrounding the probe sites amplicons should be designed for", required=true) RodBinding probeIntervals; - @Input(fullName = "ValidateAlleles", doc="Chris document me", required=true) + @Input(fullName = "ValidateAlleles", doc="A VCF containing the sites and alleles you want to validate. Restricted to *BI-Allelic* sites", required=true) RodBinding validateAlleles; - @Input(fullName = "MaskAlleles", doc="Chris document me", required=true) + @Input(fullName = "MaskAlleles", doc="A VCF containing the sites you want to MASK from the designed amplicon (e.g. by Ns or lower-cased bases)", required=true) RodBinding maskAlleles; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index fb172e1b7..d46387084 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -31,6 +31,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.Reference; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.SampleUtils; @@ -49,7 +51,7 @@ import java.util.*; * priority list (if provided), emits a single record instance at every position represented in the rods. */ @Reference(window=@Window(start=-50,stop=50)) -public class CombineVariants extends RodWalker { +public class CombineVariants extends RodWalker implements TreeReducible{ /** * The VCF files to merge together * @@ -210,6 +212,8 @@ public class CombineVariants extends RodWalker { return 0; } + public Integer treeReduce(Integer left, Integer right) { return left + right; } + public Integer reduce(Integer counter, Integer sum) { return counter + sum; } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index e197bb973..8c35119dd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -25,10 +25,13 @@ package org.broadinstitute.sting.utils; +import cern.jet.random.ChiSquare; +import cern.jet.math.Arithmetic; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.apache.lucene.messages.NLS; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.math.BigDecimal; @@ -893,6 +896,7 @@ public class MathUtils { return orderStatisticSearch((int) Math.ceil(list.size()/2), list); } + /* public static byte getQScoreOrderStatistic(List reads, List offsets, int k) { // version of the order statistic calculator for SAMRecord/Integer lists, where the // list index maps to a q-score only through the offset index @@ -937,7 +941,7 @@ public class MathUtils { public static byte getQScoreMedian(List reads, List offsets) { return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); - } + }*/ /** A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that @@ -1355,4 +1359,133 @@ public class MathUtils { public static double log10Factorial (int x) { return log10Gamma(x+1); } + + /** + * Computes the p-value for the null hypothesis that the rows of the table are i.i.d. using a pearson chi-square test + * @param contingencyTable - a contingency table + * @return - the ensuing p-value (via chi-square) + */ + public static double contingencyChiSquare(short[][] contingencyTable ) { + int[] rowSum = new int[contingencyTable.length]; + int[] colSum = new int[contingencyTable[0].length]; + int total = 0; + for ( int i = 0; i < contingencyTable.length; i++ ) { + for ( int j = 0; j < contingencyTable[0].length; j++ ) { + rowSum[i] += contingencyTable[i][j]; + colSum[j] += contingencyTable[i][j]; + total += contingencyTable[i][j]; + } + } + + double chi = 0; + for ( int i = 0; i < contingencyTable.length; i ++ ) { + for ( int j = 0; j < contingencyTable[0].length; j++ ) { + double expected = (((double) colSum[j])/total)*rowSum[i]; + double resid = contingencyTable[i][j] - expected; + chi += resid*resid/expected; + } + } + + return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); + } + + /** + * Exactly the same as above, but using int arrays rather than short arrays on input + * @param contingencyTable + * @return + */ + public static double contingencyChiSquare(int[][] contingencyTable ) { + int[] rowSum = new int[contingencyTable.length]; + int[] colSum = new int[contingencyTable[0].length]; + int total = 0; + for ( int i = 0; i < contingencyTable.length; i++ ) { + for ( int j = 0; j < contingencyTable[0].length; j++ ) { + rowSum[i] += contingencyTable[i][j]; + colSum[j] += contingencyTable[i][j]; + total += contingencyTable[i][j]; + } + } + + double chi = 0; + for ( int i = 0; i < contingencyTable.length; i ++ ) { + for ( int j = 0; j < contingencyTable[0].length; j++ ) { + double expected = (((double) colSum[j])/total)*rowSum[i]; + double resid = contingencyTable[i][j] - expected; + chi += resid*resid/expected; + } + } + + return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); + } + +======= + public static double marginalizedFisherExact(double[] spectrum1, double[] spectrum2, int ns1, int ns2) { + int N = ns1 + ns2; + int[] rowSums = { ns1, ns2 }; + double logP = Double.NEGATIVE_INFINITY; + // todo -- sorting and strobing should chage this n^2 loop to a nlog(n) algorithm + for ( int ac1 = 0; ac1 < spectrum1.length; ac1++ ) { + for ( int ac2 = 0; ac2 < spectrum2.length; ac2++ ) { + double logPTable = spectrum1[ac1] + spectrum2[ac2]; + int[][] table = { + { ac1, ns1-ac1 }, + { ac2, ns2-ac2 } + }; + int[] colSums = { ac1 + ac2, N-ac1-ac2}; + double logPH0 = Math.log10(fisherExact(table,rowSums,colSums,N)); + logP = log10sumLog10(new double[]{logP,logPTable+logPH0}); + } + } + + return Math.pow(10,logP); + } + + /** + * Calculates the p-value for a fisher exact test for a 2x2 contingency table + */ + public static double fisherExact(int[][] table) { + if ( table.length > 2 || table[0].length > 2 ) { + throw new ReviewedStingException("Fisher exact is only implemented for 2x2 contingency tables"); + } + + int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; + int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) }; + int N = rowSums[0] + rowSums[1]; + + return fisherExact(table,rowSums,colSums,N); + + } + + public static double fisherExact(int[][] table, int[] rowSums, int[] colSums, int N ) { + + // calculate in log space so we don't die with high numbers + double pCutoff = Arithmetic.logFactorial(rowSums[0]) + + Arithmetic.logFactorial(rowSums[1]) + + Arithmetic.logFactorial(colSums[0]) + + Arithmetic.logFactorial(colSums[1]) + - Arithmetic.logFactorial(table[0][0]) + - Arithmetic.logFactorial(table[0][1]) + - Arithmetic.logFactorial(table[1][0]) + - Arithmetic.logFactorial(table[1][1]) + - Arithmetic.logFactorial(N); + return Math.exp(pCutoff); + } + + public static int sumRow(int[][] table, int column) { + int sum = 0; + for (int r = 0; r < table.length; r++) { + sum += table[r][column]; + } + + return sum; + } + + public static int sumColumn(int[][] table, int row) { + int sum = 0; + for (int c = 0; c < table[row].length; c++) { + sum += table[row][c]; + } + + return sum; + } } From 5aa61fefec8442de1826343154e4f83c89704ede Mon Sep 17 00:00:00 2001 From: Christopher Hartl Date: Mon, 15 Aug 2011 16:53:05 -0400 Subject: [PATCH 15/17] Remove merge-added ======'s so this compiles --- public/java/src/org/broadinstitute/sting/utils/MathUtils.java | 1 - 1 file changed, 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 8c35119dd..75bb0151a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1418,7 +1418,6 @@ public class MathUtils { return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); } -======= public static double marginalizedFisherExact(double[] spectrum1, double[] spectrum2, int ns1, int ns2) { int N = ns1 + ns2; int[] rowSums = { ns1, ns2 }; From 1968b65ca86f6a5bf5142049366462aa86a0768c Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 15 Aug 2011 18:45:21 -0400 Subject: [PATCH 16/17] Revert "Remove merge-added ======'s so this compiles" This reverts commit be028b6513a129f81aa6f3593ea7d396c0e8fc25. --- public/java/src/org/broadinstitute/sting/utils/MathUtils.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 75bb0151a..8c35119dd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -1418,6 +1418,7 @@ public class MathUtils { return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); } +======= public static double marginalizedFisherExact(double[] spectrum1, double[] spectrum2, int ns1, int ns2) { int N = ns1 + ns2; int[] rowSums = { ns1, ns2 }; From 3e9ef0622de51e5ffbfb88e0be2a6825f33d43f4 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Mon, 15 Aug 2011 18:45:38 -0400 Subject: [PATCH 17/17] Revert "1) RFCombine switched to the new ROD system" This reverts commit cf989bd3cfae119ba9011873c5f5d5b80e37f67b. --- .../filters/VariantFiltrationWalker.java | 9 +- .../genotyper/ExactAFCalculationModel.java | 69 +-------- .../validation/ValidationAmplicons.java | 72 ++-------- .../walkers/variantutils/CombineVariants.java | 6 +- .../broadinstitute/sting/utils/MathUtils.java | 135 +----------------- 5 files changed, 12 insertions(+), 279 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java index e34fa772b..c555e88cd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationWalker.java @@ -28,9 +28,6 @@ package org.broadinstitute.sting.gatk.walkers.filters; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -50,7 +47,7 @@ import java.util.*; * Filters variant calls using a number of user-selectable, parameterizable criteria. */ @Reference(window=@Window(start=-50,stop=50)) -public class VariantFiltrationWalker extends RodWalker implements TreeReducible { +public class VariantFiltrationWalker extends RodWalker { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -281,10 +278,6 @@ public class VariantFiltrationWalker extends RodWalker impleme return sum + value; } - public Integer treeReduce(Integer left, Integer right) { - return left + right; - } - /** * Tell the user the number of loci processed and close out the new variants file. * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java index fa4863330..cd006a3cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java @@ -34,7 +34,6 @@ import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import sun.reflect.generics.reflectiveObjects.NotImplementedException; @@ -581,7 +580,7 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { // TODO -- remove me for clarity in this code // // ------------------------------------------------------------------------------------- - public static int gdaN2GoldStandard(Map GLs, + public int gdaN2GoldStandard(Map GLs, double[] log10AlleleFrequencyPriors, double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { int numSamples = GLs.size(); @@ -659,70 +658,4 @@ public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { } } - // todo -- generalize and merge into gdaN2GoldStandard - public static int rfaN2GoldStandard(Map GLs, - double[] log10AlleleFrequencyPriors, - double[] log10AlleleFrequencyPosteriors, int idxAA, int idxAB, int idxBB) { - int numSamples = GLs.size(); - int numChr = 2*numSamples; - - double[][] logYMatrix = new double[1+numSamples][1+numChr]; - - for (int i=0; i <=numSamples; i++) - for (int j=0; j <=numChr; j++) - logYMatrix[i][j] = Double.NEGATIVE_INFINITY; - - //YMatrix[0][0] = 1.0; - logYMatrix[0][0] = 0.0; - int j=0; - - for ( Map.Entry sample : GLs.entrySet() ) { - j++; - - //double[] genotypeLikelihoods = MathUtils.normalizeFromLog10(GLs.get(sample).getLikelihoods()); - double[] genotypeLikelihoods = sample.getValue().getAsVector(); - //double logDenominator = Math.log10(2.0*j*(2.0*j-1)); - double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - - // special treatment for k=0: iteration reduces to: - //YMatrix[j][0] = YMatrix[j-1][0]*genotypeLikelihoods[GenotypeType.AA.ordinal()]; - logYMatrix[j][0] = logYMatrix[j-1][0] + genotypeLikelihoods[idxAA]; - - for (int k=1; k <= 2*j; k++ ) { - - //double num = (2.0*j-k)*(2.0*j-k-1)*YMatrix[j-1][k] * genotypeLikelihoods[GenotypeType.AA.ordinal()]; - double logNumerator[]; - logNumerator = new double[3]; - if (k < 2*j-1) - logNumerator[0] = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + logYMatrix[j-1][k] + - genotypeLikelihoods[idxAA]; - else - logNumerator[0] = Double.NEGATIVE_INFINITY; - - - if (k < 2*j) - logNumerator[1] = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ logYMatrix[j-1][k-1] + - genotypeLikelihoods[idxAB]; - else - logNumerator[1] = Double.NEGATIVE_INFINITY; - - if (k > 1) - logNumerator[2] = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + logYMatrix[j-1][k-2] + - genotypeLikelihoods[idxBB]; - else - logNumerator[2] = Double.NEGATIVE_INFINITY; - - double logNum = MathUtils.softMax(logNumerator); - - //YMatrix[j][k] = num/den; - logYMatrix[j][k] = logNum - logDenominator; - } - - } - - for (int k=0; k <= numChr; k++) - log10AlleleFrequencyPosteriors[k] = logYMatrix[j][k] + log10AlleleFrequencyPriors[k]; - - return numChr; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index 82c5fc593..7653f511f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -31,77 +31,21 @@ import java.util.LinkedList; import java.util.List; /** - * Creates FASTA sequences for use in Seqenom or PCR utilities for site amplification and subsequent validation - * - *

- * ValidationAmplicons consumes a VCF and an Interval list and produces FASTA sequences from which PCR primers or probe - * sequences can be designed. In addition, ValidationAmplicons uses BWA to check for specificity of tracts of bases within - * the output amplicon, lower-casing non-specific tracts, allows for users to provide sites to mask out, and specifies - * reasons why the site may fail validation (nearby variation, for example). - *

- * - *

Input

- *

- * Requires a VCF containing alleles to design amplicons towards, a VCF of variants to mask out of the amplicons, and an - * interval list defining the size of the amplicons around the sites to be validated - *

- * - *

Output

- *

- * Output is a FASTA-formatted file with some modifications at probe sites. For instance: - * - * >20:207414 INSERTION=1,VARIANT_TOO_NEAR_PROBE=1, 20_207414 - * CCAACGTTAAGAAAGAGACATGCGACTGGGTgcggtggctcatgcctggaaccccagcactttgggaggccaaggtgggc[A/G*]gNNcacttgaggtcaggagtttgagaccagcctggccaacatggtgaaaccccgtctctactgaaaatacaaaagttagC - * >20:792122 Valid 20_792122 - * TTTTTTTTTagatggagtctcgctcttatcgcccaggcNggagtgggtggtgtgatcttggctNactgcaacttctgcct[-/CCC*]cccaggttcaagtgattNtcctgcctcagccacctgagtagctgggattacaggcatccgccaccatgcctggctaatTT - * >20:994145 Valid 20_994145 - * TCCATGGCCTCCCCCTGGCCCACGAAGTCCTCAGCCACCTCCTTCCTGGAGGGCTCAGCCAAAATCAGACTGAGGAAGAAG[AAG/-*]TGGTGGGCACCCACCTTCTGGCCTTCCTCAGCCCCTTATTCCTAGGACCAGTCCCCATCTAGGGGTCCTCACTGCCTCCC - * >20:1074230 SITE_IS_FILTERED=1, 20_1074230 - * ACCTGATTACCATCAATCAGAACTCATTTCTGTTCCTATCTTCCACCCACAATTGTAATGCCTTTTCCATTTTAACCAAG[T/C*]ACTTATTATAtactatggccataacttttgcagtttgaggtatgacagcaaaaTTAGCATACATTTCATTTTCCTTCTTC - * >20:1084330 DELETION=1, 20_1084330 - * CACGTTCGGcttgtgcagagcctcaaggtcatccagaggtgatAGTTTAGGGCCCTCTCAAGTCTTTCCNGTGCGCATGG[GT/AC*]CAGCCCTGGGCACCTGTNNNNNNNNNNNNNTGCTCATGGCCTTCTAGATTCCCAGGAAATGTCAGAGCTTTTCAAAGCCC - * - * are amplicon sequences resulting from running the tool. The flags (preceding the sequence itself) can be: - * - * Valid // amplicon is valid - * SITE_IS_FILTERED=1 // validation site is not marked 'PASS' or '.' in its filter field ("you are trying to validate a filtered variant") - * VARIANT_TOO_NEAR_PROBE=1 // there is a variant too near to the variant to be validated, potentially shifting the mass-spec peak - * MULTIPLE_PROBES=1, // multiple variants to be validated found inside the same amplicon - * DELETION=6,INSERTION=5, // 6 deletions and 5 insertions found inside the amplicon region (from the "mask" VCF), will be potentially difficult to validate - * DELETION=1, // deletion found inside the amplicon region, could shift mass-spec peak - * START_TOO_CLOSE, // variant is too close to the start of the amplicon region to give sequenom a good chance to find a suitable primer - * END_TOO_CLOSE, // variant is too close to the end of the amplicon region to give sequenom a good chance to find a suitable primer - * NO_VARIANTS_FOUND, // no variants found within the amplicon region - * INDEL_OVERLAPS_VALIDATION_SITE, // an insertion or deletion interferes directly with the site to be validated (i.e. insertion directly preceding or postceding, or a deletion that spans the site itself) - *

- * - *

Examples

- * PRE-TAG - * java - * -jar GenomeAnalysisTK.jar - * -T ValidationAmplicons - * -R /humgen/1kg/reference/human_g1k_v37.fasta - * -BTI ProbeIntervals - * -ProbeIntervals:table interval_table.table - * -ValidateAlleles:vcf sites_to_validate.vcf - * -MaskAlleles:vcf mask_sites.vcf - * --virtualPrimerSize 30 - * -o probes.fasta - * PRE-TAG - * - * @author chartl - * @since July 2011 + * Created by IntelliJ IDEA. + * User: chartl + * Date: 6/13/11 + * Time: 2:12 PM + * To change this template use File | Settings | File Templates. */ @Requires(value={DataSource.REFERENCE}) public class ValidationAmplicons extends RodWalker { - @Input(fullName = "ProbeIntervals", doc="A collection of intervals in table format with optional names that represent the "+ - "intervals surrounding the probe sites amplicons should be designed for", required=true) + @Input(fullName = "ProbeIntervals", doc="Chris document me", required=true) RodBinding probeIntervals; - @Input(fullName = "ValidateAlleles", doc="A VCF containing the sites and alleles you want to validate. Restricted to *BI-Allelic* sites", required=true) + @Input(fullName = "ValidateAlleles", doc="Chris document me", required=true) RodBinding validateAlleles; - @Input(fullName = "MaskAlleles", doc="A VCF containing the sites you want to MASK from the designed amplicon (e.g. by Ns or lower-cased bases)", required=true) + @Input(fullName = "MaskAlleles", doc="Chris document me", required=true) RodBinding maskAlleles; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index d46387084..fb172e1b7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -31,8 +31,6 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.Reference; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; import org.broadinstitute.sting.utils.SampleUtils; @@ -51,7 +49,7 @@ import java.util.*; * priority list (if provided), emits a single record instance at every position represented in the rods. */ @Reference(window=@Window(start=-50,stop=50)) -public class CombineVariants extends RodWalker implements TreeReducible{ +public class CombineVariants extends RodWalker { /** * The VCF files to merge together * @@ -212,8 +210,6 @@ public class CombineVariants extends RodWalker implements Tree return 0; } - public Integer treeReduce(Integer left, Integer right) { return left + right; } - public Integer reduce(Integer counter, Integer sum) { return counter + sum; } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 8c35119dd..e197bb973 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -25,13 +25,10 @@ package org.broadinstitute.sting.utils; -import cern.jet.random.ChiSquare; -import cern.jet.math.Arithmetic; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.apache.lucene.messages.NLS; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.math.BigDecimal; @@ -896,7 +893,6 @@ public class MathUtils { return orderStatisticSearch((int) Math.ceil(list.size()/2), list); } - /* public static byte getQScoreOrderStatistic(List reads, List offsets, int k) { // version of the order statistic calculator for SAMRecord/Integer lists, where the // list index maps to a q-score only through the offset index @@ -941,7 +937,7 @@ public class MathUtils { public static byte getQScoreMedian(List reads, List offsets) { return getQScoreOrderStatistic(reads, offsets, (int)Math.floor(reads.size()/2.)); - }*/ + } /** A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that @@ -1359,133 +1355,4 @@ public class MathUtils { public static double log10Factorial (int x) { return log10Gamma(x+1); } - - /** - * Computes the p-value for the null hypothesis that the rows of the table are i.i.d. using a pearson chi-square test - * @param contingencyTable - a contingency table - * @return - the ensuing p-value (via chi-square) - */ - public static double contingencyChiSquare(short[][] contingencyTable ) { - int[] rowSum = new int[contingencyTable.length]; - int[] colSum = new int[contingencyTable[0].length]; - int total = 0; - for ( int i = 0; i < contingencyTable.length; i++ ) { - for ( int j = 0; j < contingencyTable[0].length; j++ ) { - rowSum[i] += contingencyTable[i][j]; - colSum[j] += contingencyTable[i][j]; - total += contingencyTable[i][j]; - } - } - - double chi = 0; - for ( int i = 0; i < contingencyTable.length; i ++ ) { - for ( int j = 0; j < contingencyTable[0].length; j++ ) { - double expected = (((double) colSum[j])/total)*rowSum[i]; - double resid = contingencyTable[i][j] - expected; - chi += resid*resid/expected; - } - } - - return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); - } - - /** - * Exactly the same as above, but using int arrays rather than short arrays on input - * @param contingencyTable - * @return - */ - public static double contingencyChiSquare(int[][] contingencyTable ) { - int[] rowSum = new int[contingencyTable.length]; - int[] colSum = new int[contingencyTable[0].length]; - int total = 0; - for ( int i = 0; i < contingencyTable.length; i++ ) { - for ( int j = 0; j < contingencyTable[0].length; j++ ) { - rowSum[i] += contingencyTable[i][j]; - colSum[j] += contingencyTable[i][j]; - total += contingencyTable[i][j]; - } - } - - double chi = 0; - for ( int i = 0; i < contingencyTable.length; i ++ ) { - for ( int j = 0; j < contingencyTable[0].length; j++ ) { - double expected = (((double) colSum[j])/total)*rowSum[i]; - double resid = contingencyTable[i][j] - expected; - chi += resid*resid/expected; - } - } - - return 1.0-(new ChiSquare(contingencyTable.length*contingencyTable[0].length,null)).cdf(chi); - } - -======= - public static double marginalizedFisherExact(double[] spectrum1, double[] spectrum2, int ns1, int ns2) { - int N = ns1 + ns2; - int[] rowSums = { ns1, ns2 }; - double logP = Double.NEGATIVE_INFINITY; - // todo -- sorting and strobing should chage this n^2 loop to a nlog(n) algorithm - for ( int ac1 = 0; ac1 < spectrum1.length; ac1++ ) { - for ( int ac2 = 0; ac2 < spectrum2.length; ac2++ ) { - double logPTable = spectrum1[ac1] + spectrum2[ac2]; - int[][] table = { - { ac1, ns1-ac1 }, - { ac2, ns2-ac2 } - }; - int[] colSums = { ac1 + ac2, N-ac1-ac2}; - double logPH0 = Math.log10(fisherExact(table,rowSums,colSums,N)); - logP = log10sumLog10(new double[]{logP,logPTable+logPH0}); - } - } - - return Math.pow(10,logP); - } - - /** - * Calculates the p-value for a fisher exact test for a 2x2 contingency table - */ - public static double fisherExact(int[][] table) { - if ( table.length > 2 || table[0].length > 2 ) { - throw new ReviewedStingException("Fisher exact is only implemented for 2x2 contingency tables"); - } - - int[] rowSums = { sumRow(table, 0), sumRow(table, 1) }; - int[] colSums = { sumColumn(table, 0), sumColumn(table, 1) }; - int N = rowSums[0] + rowSums[1]; - - return fisherExact(table,rowSums,colSums,N); - - } - - public static double fisherExact(int[][] table, int[] rowSums, int[] colSums, int N ) { - - // calculate in log space so we don't die with high numbers - double pCutoff = Arithmetic.logFactorial(rowSums[0]) - + Arithmetic.logFactorial(rowSums[1]) - + Arithmetic.logFactorial(colSums[0]) - + Arithmetic.logFactorial(colSums[1]) - - Arithmetic.logFactorial(table[0][0]) - - Arithmetic.logFactorial(table[0][1]) - - Arithmetic.logFactorial(table[1][0]) - - Arithmetic.logFactorial(table[1][1]) - - Arithmetic.logFactorial(N); - return Math.exp(pCutoff); - } - - public static int sumRow(int[][] table, int column) { - int sum = 0; - for (int r = 0; r < table.length; r++) { - sum += table[r][column]; - } - - return sum; - } - - public static int sumColumn(int[][] table, int row) { - int sum = 0; - for (int c = 0; c < table[row].length; c++) { - sum += table[row][c]; - } - - return sum; - } }