From 390d493049f66d01b0ef925f572358a862b767e0 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 26 Jan 2012 11:37:08 -0500 Subject: [PATCH 01/22] Updating ActiveRegionWalker interface to output a probability of active status instead of a boolean. Integrator runs a band-pass filter over this probability to produce actual active regions. First version of HaplotypeCaller which decides for itself where to trigger and assembles those regions. --- .../traversals/TraverseActiveRegions.java | 90 ++++++++++--------- .../gatk/walkers/ActiveRegionWalker.java | 4 +- .../walkers/genotyper/UnifiedGenotyper.java | 2 + 3 files changed, 52 insertions(+), 44 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index f5e936a09..83daa4d80 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -10,14 +10,12 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.ArrayList; -import java.util.LinkedHashSet; -import java.util.LinkedList; -import java.util.Queue; +import java.util.*; /** * Created by IntelliJ IDEA. @@ -54,7 +52,8 @@ public class TraverseActiveRegions extends TraversalEngine isActiveList = new ArrayList(); + final ArrayList isActiveList = new ArrayList(); + GenomeLoc firstIsActiveStart = null; //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); ReferenceOrderedView referenceOrderedDataView = null; @@ -91,11 +90,15 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine activeRegions = integrateActiveList( isActiveList ); + final ArrayList activeRegions = integrateActiveList( isActiveList, firstIsActiveStart, activeRegionExtension ); logger.debug("Integrated " + isActiveList.size() + " isActive calls into " + activeRegions.size() + " regions." ); if( walker.activeRegionOutStream == null ) { workQueue.addAll( activeRegions ); @@ -137,14 +134,11 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine integrateActiveList( final ArrayList activeList ) { + // band-pass filter the list of isActive probabilities and turn into active regions + private ArrayList integrateActiveList( final ArrayList activeList, final GenomeLoc firstIsActiveStart, final int activeRegionExtension ) { + + final double ACTIVE_PROB_THRESHOLD = 0.2; final ArrayList returnList = new ArrayList(); if( activeList.size() == 0 ) { return returnList; } else if( activeList.size() == 1 ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(activeList.get(0).getLocation().getContig(), activeList.get(0).getLocation().getStart(), activeList.get(0).getLocation().getStart()), - activeList.get(0).isActive, engine.getGenomeLocParser(), activeList.get(0).getExtension() ) ); + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart(), firstIsActiveStart.getStart()), + activeList.get(0) > ACTIVE_PROB_THRESHOLD, engine.getGenomeLocParser(), activeRegionExtension ) ); return returnList; } else { - ActiveRegion prevLocus = activeList.get(0); - ActiveRegion startLocus = prevLocus; - for( final ActiveRegion thisLocus : activeList ) { - if( prevLocus.isActive != thisLocus.isActive || !prevLocus.getLocation().contiguousP( thisLocus.getLocation() ) ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), - prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) ); - startLocus = thisLocus; + final Double[] activeProbArray = activeList.toArray(new Double[activeList.size()]); + final double[] filteredProbArray = new double[activeProbArray.length]; + final int FILTER_SIZE = 10; + final int MAX_ACTIVE_REGION = 200; + for( int iii = 0; iii < activeProbArray.length; iii++ ) { + double maxVal = 0; + for( int jjj = Math.max( 0, iii-FILTER_SIZE); jjj < Math.min( activeList.size(), iii+FILTER_SIZE); jjj++ ) { + if( activeProbArray[jjj] > maxVal ) { maxVal = activeProbArray[jjj]; } } - prevLocus = thisLocus; + filteredProbArray[iii] = maxVal; } - // output the last region if necessary - if( startLocus != prevLocus ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(startLocus.getLocation().getContig(), startLocus.getLocation().getStart(), prevLocus.getLocation().getStart()), - prevLocus.isActive, engine.getGenomeLocParser(), startLocus.getExtension() ) ); + + boolean curStatus = filteredProbArray[0] > ACTIVE_PROB_THRESHOLD; + int curStart = 0; + for(int iii = 1; iii < filteredProbArray.length; iii++ ) { + final boolean thisStatus = filteredProbArray[iii] > ACTIVE_PROB_THRESHOLD; + if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) { + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)), + curStatus, engine.getGenomeLocParser(), activeRegionExtension ) ); + curStatus = thisStatus; + curStart = iii; + } } + returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)), + curStatus, engine.getGenomeLocParser(), activeRegionExtension ) ); return returnList; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index 98308ee11..244870c78 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -73,8 +73,8 @@ public abstract class ActiveRegionWalker extends Walker Date: Thu, 26 Jan 2012 15:56:33 -0500 Subject: [PATCH 04/22] HaplotypeCaller now uses insertions and softclipped bases as possible triggers. LocusIteratorByState tags pileup elements with the required info to make this calculation efficient. The days of the extended event pileup are coming to a close. --- .../gatk/iterators/LocusIteratorByState.java | 9 ++++- ...NPGenotypeLikelihoodsCalculationModel.java | 2 +- .../pileup/AbstractReadBackedPileup.java | 6 +-- .../pileup/ExtendedEventPileupElement.java | 4 +- .../sting/utils/pileup/PileupElement.java | 37 +++++++++---------- .../ReadBackedExtendedEventPileupImpl.java | 2 +- .../utils/pileup/ReadBackedPileupImpl.java | 4 +- .../sting/utils/sam/ArtificialSAMUtils.java | 4 +- .../sting/utils/sam/ReadUtilsUnitTest.java | 4 +- 9 files changed, 39 insertions(+), 33 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index f1ffa121b..2257cc139 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -176,6 +176,10 @@ public class LocusIteratorByState extends LocusIterator { return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement); } + public CigarOperator peekForwardOnGenome() { + return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement ).getOperator(); + } + public CigarOperator stepForwardOnGenome() { // we enter this method with readOffset = index of the last processed base on the read // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion @@ -455,6 +459,7 @@ public class LocusIteratorByState extends LocusIterator { final SAMRecordState state = iterator.next(); // state object with the read/offset information final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator + final CigarOperator nextOp = state.peekForwardOnGenome(); // next cigar operator final int readOffset = state.getReadOffset(); // the base offset on this read final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began. @@ -467,13 +472,13 @@ public class LocusIteratorByState extends LocusIterator { if (op == CigarOperator.D) { if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so int leftAlignedStart = (eventStartOffset < 0) ? readOffset : eventStartOffset; - pile.add(new PileupElement(read, leftAlignedStart, true)); + pile.add(new PileupElement(read, leftAlignedStart, true, nextOp == CigarOperator.I, false)); size++; nDeletions++; } } else { if (!filterBaseInRead(read, location.getStart())) { - pile.add(new PileupElement(read, readOffset, false)); + pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.I, op == CigarOperator.S)); size++; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index d9ee2ba1b..5980ff356 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -212,7 +212,7 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC public class BAQedPileupElement extends PileupElement { public BAQedPileupElement( final PileupElement PE ) { - super(PE.getRead(), PE.getOffset(), PE.isDeletion()); + super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeInsertion(), PE.isSoftClipped()); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 1fa7101ca..f4fa9e941 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -177,7 +177,7 @@ public abstract class AbstractReadBackedPileup pileup = new UnifiedPileupElementTracker(); for (GATKSAMRecord read : reads) { - pileup.add(createNewPileupElement(read, offset, BaseUtils.simpleBaseToBaseIndex(read.getReadBases()[offset]) == BaseUtils.D)); + pileup.add(createNewPileupElement(read, offset, false, false, false)); // only used to create fake pileups for testing so ancillary information is not important } return pileup; @@ -204,7 +204,7 @@ public abstract class AbstractReadBackedPileup createNewPileup(GenomeLoc loc, PileupElementTracker pileupElementTracker); - protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion); + protected abstract PE createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeInsertion, boolean isSoftClipped); // -------------------------------------------------------- // diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java index 1d7e6f636..921da2a1f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ExtendedEventPileupElement.java @@ -31,6 +31,8 @@ import java.util.Arrays; * Time: 2:57:55 PM * To change this template use File | Settings | File Templates. */ + +// Extended events are slated for removal public class ExtendedEventPileupElement extends PileupElement { public enum Type { NOEVENT, DELETION, INSERTION @@ -46,7 +48,7 @@ public class ExtendedEventPileupElement extends PileupElement { public ExtendedEventPileupElement(GATKSAMRecord read, int offset, int eventLength, String eventBases, Type type) { - super(read, offset, type == Type.DELETION); + super(read, offset, type == Type.DELETION, false, false); // extended events are slated for removal this.read = read; this.offset = offset; this.eventLength = eventLength; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 73f010d40..87aa31c47 100755 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -23,47 +23,46 @@ public class PileupElement implements Comparable { protected final GATKSAMRecord read; protected final int offset; protected final boolean isDeletion; + protected final boolean isBeforeInsertion; + protected final boolean isSoftClipped; /** * Creates a new pileup element. * - * @param read the read we are adding to the pileup - * @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions) - * @param isDeletion whether or not this base is a deletion + * @param read the read we are adding to the pileup + * @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions) + * @param isDeletion whether or not this base is a deletion + * @param isBeforeInsertion whether or not this base is before an insertion + * @param isSoftClipped whether or not this base was softclipped */ @Requires({ "read != null", "offset >= -1", "offset <= read.getReadLength()"}) - public PileupElement(GATKSAMRecord read, int offset, boolean isDeletion) { + public PileupElement(final GATKSAMRecord read, final int offset, final boolean isDeletion, final boolean isBeforeInsertion, final boolean isSoftClipped) { if (offset < 0 && isDeletion) throw new ReviewedStingException("Pileup Element cannot create a deletion with a negative offset"); this.read = read; this.offset = offset; this.isDeletion = isDeletion; + this.isBeforeInsertion = isBeforeInsertion; + this.isSoftClipped = isSoftClipped; } - // /** -// * Creates a NON DELETION pileup element. -// * -// * use this constructor only for insertions and matches/mismatches. -// * @param read the read we are adding to the pileup -// * @param offset the position in the read for this base. All deletions must be left aligned! (-1 is only allowed for reads starting with insertions) -// */ -// @Requires({ -// "read != null", -// "offset >= -1", -// "offset <= read.getReadLength()"}) -// public PileupElement( GATKSAMRecord read, int offset ) { -// this(read, offset, false); -// } -// public boolean isDeletion() { return isDeletion; } + public boolean isBeforeInsertion() { + return isBeforeInsertion; + } + + public boolean isSoftClipped() { + return isSoftClipped; + } + public boolean isInsertionAtBeginningOfRead() { return offset == -1; } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java index bf67d1a70..641c63f6c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedExtendedEventPileupImpl.java @@ -96,7 +96,7 @@ public class ReadBackedExtendedEventPileupImpl extends AbstractReadBackedPileup< } @Override - protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion) { + protected ExtendedEventPileupElement createNewPileupElement(GATKSAMRecord read, int offset, boolean isDeletion, boolean isBeforeInsertion, boolean isSoftClipped) { throw new UnsupportedOperationException("Not enough information provided to create a new pileup element"); } diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index 66ddbe95d..965e74e8b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -71,7 +71,7 @@ public class ReadBackedPileupImpl extends AbstractReadBackedPileup= right.getAlignmentStart() && pos <= right.getAlignmentEnd()) { - pileupElements.add(new PileupElement(right, pos - rightStart, false)); + pileupElements.add(new PileupElement(right, pos - rightStart, false, false, false)); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index 1a8086a1b..b7a22ca1a 100755 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -42,8 +42,8 @@ public class ReadUtilsUnitTest extends BaseTest { @Test public void testReducedReadPileupElement() { - PileupElement readp = new PileupElement(read, 0, false); - PileupElement reducedreadp = new PileupElement(reducedRead, 0, false); + PileupElement readp = new PileupElement(read, 0, false, false, false); + PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false); Assert.assertFalse(readp.getRead().isReducedRead()); From 07f72516ae299a61014b3a1fe8a1ba6c34140687 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Thu, 26 Jan 2012 16:14:25 -0500 Subject: [PATCH 05/22] Unsupported platform should be a user error --- .../sting/gatk/walkers/recalibration/CycleCovariate.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java index 6b4fec04e..b0819ee69 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/recalibration/CycleCovariate.java @@ -159,7 +159,7 @@ public class CycleCovariate implements StandardCovariate { } } else { - throw new IllegalStateException("This method hasn't been implemented yet for " + read.getReadGroup().getPlatform()); + throw new UserException("The platform (" + read.getReadGroup().getPlatform() + ") associated with read group " + read.getReadGroup() + " is not a recognized platform. Implemented options are e.g. illumina, 454, and solid"); } } From 0d4027104f2d511aacf8bc01dea00d0c5f0fb2ec Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 26 Jan 2012 16:07:29 -0500 Subject: [PATCH 06/22] Reduced reads are now aware of their original alignments * Added annotations for reads that had been soft clipped prior to being reduced so that we can later recuperate their original alignments (start and end). * Tags keep the alignment shifts, not real alignment, for better compression * Tags are defined in the GATKSAMRecord * GATKSAMRecord has new functionality to retrieve original alignment start of all reads (trimmed or not) -- getOriginalAlignmentStart() and getOriginalAligmentEnd() * Updated ReduceReads MD5s accordingly --- .../sting/utils/sam/GATKSAMRecord.java | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 913548ecc..f17772f40 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -43,7 +43,10 @@ import java.util.Map; * */ public class GATKSAMRecord extends BAMRecord { - public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; + // ReduceReads specific attribute tags + public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; // marks a synthetic read produced by the ReduceReads tool + public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OS"; // reads that are clipped may use this attribute to keep track of their original alignment start + public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end // the SAMRecord data we're caching private String mReadString = null; @@ -321,6 +324,36 @@ public class GATKSAMRecord extends BAMRecord { return (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } + /** + * Determines the original alignment start of a previously clipped read. + * + * This is useful for reads that have been trimmed to a variant region and lost the information of it's original alignment end + * + * @return the alignment start of a read before it was clipped + */ + public int getOriginalAlignmentStart() { + int originalAlignmentStart = getUnclippedStart(); + Integer alignmentShift = (Integer) getAttribute(REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT); + if (alignmentShift != null) + originalAlignmentStart += alignmentShift; + return originalAlignmentStart; + } + + /** + * Determines the original alignment end of a previously clipped read. + * + * This is useful for reads that have been trimmed to a variant region and lost the information of it's original alignment end + * + * @return the alignment end of a read before it was clipped + */ + public int getOriginalAlignmentEnd() { + int originalAlignmentEnd = getUnclippedEnd(); + Integer alignmentShift = (Integer) getAttribute(REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT); + if (alignmentShift != null) + originalAlignmentEnd -= alignmentShift; + return originalAlignmentEnd; + } + /** * Creates an empty GATKSAMRecord with the read's header, read group and mate * information, but empty (not-null) fields: @@ -363,4 +396,21 @@ public class GATKSAMRecord extends BAMRecord { return emptyRead; } + /** + * Shallow copy of everything, except for the attribute list and the temporary attributes. + * A new list of the attributes is created for both, but the attributes themselves are copied by reference. + * This should be safe because callers should never modify a mutable value returned by any of the get() methods anyway. + * + * @return a shallow copy of the GATKSAMRecord + * @throws CloneNotSupportedException + */ + @Override + public Object clone() throws CloneNotSupportedException { + final GATKSAMRecord clone = (GATKSAMRecord) super.clone(); + if (temporaryAttributes != null) { + for (Object attribute : temporaryAttributes.keySet()) + clone.setTemporaryAttribute(attribute, temporaryAttributes.get(attribute)); + } + return clone; + } } From 246e085ec9773320843f50da47066cd8313f659c Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 26 Jan 2012 16:59:08 -0500 Subject: [PATCH 07/22] Unit tests for GATKSAMRecord class * new unit tests for the alignment shift properties of reduce reads * moved unit tests from ReadUtils that were actually testing GATKSAMRecord, not any of the ReadUtils to it. * cleaned up ReadUtilsUnitTest --- .../utils/sam/GATKSAMRecordUnitTest.java | 83 +++++++++++++++++++ .../sting/utils/sam/ReadUtilsUnitTest.java | 46 ---------- 2 files changed, 83 insertions(+), 46 deletions(-) create mode 100755 public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java new file mode 100755 index 000000000..317b320d3 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -0,0 +1,83 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +public class GATKSAMRecordUnitTest extends BaseTest { + GATKSAMRecord read, reducedRead; + final static String BASES = "ACTG"; + final static String QUALS = "!+5?"; + final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1}; + final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets + + @BeforeClass + public void init() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); + read.setReadUnmappedFlag(true); + read.setReadBases(new String(BASES).getBytes()); + read.setBaseQualityString(new String(QUALS)); + + reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); + reducedRead.setReadBases(BASES.getBytes()); + reducedRead.setBaseQualityString(QUALS); + reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); + } + + @Test + public void testReducedReads() { + Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); + Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); + + Assert.assertTrue(reducedRead.isReducedRead(), "isReducedRead is true for reduced read"); + for (int i = 0; i < reducedRead.getReadLength(); i++) { + Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); + } + } + + @Test + public void testReducedReadPileupElement() { + PileupElement readp = new PileupElement(read, 0, false); + PileupElement reducedreadp = new PileupElement(reducedRead, 0, false); + + Assert.assertFalse(readp.getRead().isReducedRead()); + + Assert.assertTrue(reducedreadp.getRead().isReducedRead()); + Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); + Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); + } + + @Test + public void testGetOriginalAlignments() { + final byte [] bases = {'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A'}; + final byte [] quals = {20 , 20 , 20 , 20 , 20 , 20 , 20 , 20 }; + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, quals, "6M"); + + // A regular read with all matches + Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart()); + Assert.assertEquals(read.getAlignmentEnd(), read.getOriginalAlignmentEnd()); + + // Alignment start shifted + int alignmentShift = 2; + read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, alignmentShift); + Assert.assertEquals(read.getAlignmentStart() + alignmentShift, read.getOriginalAlignmentStart()); + Assert.assertEquals(read.getAlignmentEnd(), read.getOriginalAlignmentEnd()); + + // Both alignments shifted + read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, alignmentShift); + Assert.assertEquals(read.getAlignmentStart() + alignmentShift, read.getOriginalAlignmentStart()); + Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd()); + + // Alignment end shifted + read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, null); + Assert.assertEquals(read.getAlignmentStart(), read.getOriginalAlignmentStart()); + Assert.assertEquals(read.getAlignmentEnd() - alignmentShift, read.getOriginalAlignmentEnd()); + + } + +} diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index b7a22ca1a..7598f62a6 100755 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -1,57 +1,11 @@ package org.broadinstitute.sting.utils.sam; -import net.sf.samtools.SAMFileHeader; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.testng.Assert; -import org.testng.annotations.BeforeTest; import org.testng.annotations.Test; public class ReadUtilsUnitTest extends BaseTest { - GATKSAMRecord read, reducedRead; - final static String BASES = "ACTG"; - final static String QUALS = "!+5?"; - final private static byte[] REDUCED_READ_COUNTS = new byte[]{10, 20, 30, 40, 1}; - final private static byte[] REDUCED_READ_COUNTS_TAG = new byte[]{10, 10, 20, 30, -9}; // just the offsets - - @BeforeTest - public void init() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); - read = ArtificialSAMUtils.createArtificialRead(header, "read1", 0, 1, BASES.length()); - read.setReadUnmappedFlag(true); - read.setReadBases(new String(BASES).getBytes()); - read.setBaseQualityString(new String(QUALS)); - - reducedRead = ArtificialSAMUtils.createArtificialRead(header, "reducedRead", 0, 1, BASES.length()); - reducedRead.setReadBases(BASES.getBytes()); - reducedRead.setBaseQualityString(QUALS); - reducedRead.setAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, REDUCED_READ_COUNTS_TAG); - } - - @Test - public void testReducedReads() { - Assert.assertFalse(read.isReducedRead(), "isReducedRead is false for normal read"); - Assert.assertEquals(read.getReducedReadCounts(), null, "No reduced read tag in normal read"); - - Assert.assertTrue(reducedRead.isReducedRead(), "isReducedRead is true for reduced read"); - for (int i = 0; i < reducedRead.getReadLength(); i++) { - Assert.assertEquals(reducedRead.getReducedCount(i), REDUCED_READ_COUNTS[i], "Reduced read count not set to the expected value at " + i); - } - } - - @Test - public void testReducedReadPileupElement() { - PileupElement readp = new PileupElement(read, 0, false, false, false); - PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false); - - Assert.assertFalse(readp.getRead().isReducedRead()); - - Assert.assertTrue(reducedreadp.getRead().isReducedRead()); - Assert.assertEquals(reducedreadp.getRepresentativeCount(), REDUCED_READ_COUNTS[0]); - Assert.assertEquals(reducedreadp.getQual(), readp.getQual()); - } - @Test public void testGetAdaptorBoundary() { final byte[] bases = {'A', 'C', 'G', 'T', 'A', 'C', 'G', 'T'}; From 2a565ebf90ea53e0a252fe097c778365aff3714a Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 26 Jan 2012 19:58:42 -0500 Subject: [PATCH 09/22] embarrassing fix-up, thanks Khalid. --- .../broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java index 317b320d3..729503f84 100755 --- a/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/GATKSAMRecordUnitTest.java @@ -42,8 +42,8 @@ public class GATKSAMRecordUnitTest extends BaseTest { @Test public void testReducedReadPileupElement() { - PileupElement readp = new PileupElement(read, 0, false); - PileupElement reducedreadp = new PileupElement(reducedRead, 0, false); + PileupElement readp = new PileupElement(read, 0, false, false, false); + PileupElement reducedreadp = new PileupElement(reducedRead, 0, false, false, false); Assert.assertFalse(readp.getRead().isReducedRead()); From cb04c0bf1136b351c703e9f3486527301f086067 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 27 Jan 2012 08:20:45 -0500 Subject: [PATCH 10/22] Removing javaassist 3.7, lucene library dependancies --- ivy.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ivy.xml b/ivy.xml index f5ff15c30..f7c64aec6 100644 --- a/ivy.xml +++ b/ivy.xml @@ -41,7 +41,7 @@ - + @@ -66,7 +66,7 @@ - + From 13d1626f51878d3973d43dd27a9a78e10119878d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 27 Jan 2012 08:24:17 -0500 Subject: [PATCH 11/22] Minor improvements in ref QC walker. Unfortunately this doesn't actually catch Chris's error --- .../sting/gatk/walkers/qc/QCRefWalker.java | 30 ++++++++++++++++--- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRefWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRefWalker.java index bddf27d84..ab5324e39 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRefWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/QCRefWalker.java @@ -83,7 +83,7 @@ public class QCRefWalker extends RefWalker { } private final void throwError(ReferenceContext ref, String message) { - throw new StingException(String.format("Site %s failed: %s", ref, message)); + throw new StingException(String.format("Site %s failed: %s", ref.getLocus(), message)); } public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { @@ -92,13 +92,13 @@ public class QCRefWalker extends RefWalker { contigName = locusContigName; ReferenceSequence refSeq = uncachedRef.getSequence(contigName); contigStart = 1; - contigEnd = contigStart + refSeq.length(); + contigEnd = contigStart + refSeq.length() - 1; uncachedBases = uncachedRef.getSubsequenceAt(contigName, contigStart, contigEnd).getBases(); - logger.warn(String.format("Loading contig %s (%d-%d)", contigName, contigStart, contigEnd)); + logger.info(String.format("Loading contig %s (%d-%d)", contigName, contigStart, contigEnd)); } final byte refBase = ref.getBase(); - if (! ( BaseUtils.isRegularBase(refBase) || BaseUtils.isNBase(refBase) ) ) + if (! ( BaseUtils.isRegularBase(refBase) || isExtendFastaBase(refBase) ) ) throwError(ref, String.format("Refbase isn't a regular base (%d %c)", refBase, (char)refBase)); // check bases are equal @@ -114,6 +114,28 @@ public class QCRefWalker extends RefWalker { return 1; } + private static final boolean isExtendFastaBase(final byte b) { + switch ( b ) { + case 'U': + case 'R': + case 'Y': + case 'K': + case 'M': + case 'S': + case 'W': + case 'B': + case 'D': + case 'H': + case 'V': + case 'N': + case 'X': + case '-': + return true; + default: + return false; + } + } + public Integer reduceInit() { return 0; } From ec9920b04f7a95918ed3b76a4e971143ed8084b0 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 27 Jan 2012 08:51:39 -0500 Subject: [PATCH 12/22] Updating the SAM TAG for Original Alignment Start to "OP" per Mark's recommendation to reuse the Indel Realigner tag that made it to the SAM spec. The Alignment end tag is still "OE" as there is no official tag to reuse. --- .../src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index f17772f40..03b794ae3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -45,7 +45,7 @@ import java.util.Map; public class GATKSAMRecord extends BAMRecord { // ReduceReads specific attribute tags public static final String REDUCED_READ_CONSENSUS_TAG = "RR"; // marks a synthetic read produced by the ReduceReads tool - public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OS"; // reads that are clipped may use this attribute to keep track of their original alignment start + public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT = "OP"; // reads that are clipped may use this attribute to keep track of their original alignment start public static final String REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT = "OE"; // reads that are clipped may use this attribute to keep track of their original alignment end // the SAMRecord data we're caching From 052a4bdb9cfd5f182e0a4e779f357ea63ea4449f Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Fri, 27 Jan 2012 11:13:30 -0500 Subject: [PATCH 14/22] Turning off PHONE HOME option in the MDCP * MDCP is for internal use and there is no need to report to the Amazon cloud. * Reporting to ASW_S3 is not allowing jobs to finish, this is probably a bug. --- .../queue/qscripts/MethodsDevelopmentCallingPipeline.scala | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala index 2f0715ae9..b860358ca 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala @@ -25,9 +25,6 @@ class MethodsDevelopmentCallingPipeline extends QScript { @Argument(shortName="noIndels", doc="do not call indels with the Unified Genotyper", required=false) var noIndels: Boolean = false - @Argument(shortName="LOCAL_ET", doc="Doesn't use the AWS S3 storage for ET option", required=false) - var LOCAL_ET: Boolean = false - @Argument(shortName="mbq", doc="The minimum Phred-Scaled quality score threshold to be considered a good base.", required=false) var minimumBaseQuality: Int = -1 @@ -203,7 +200,7 @@ class MethodsDevelopmentCallingPipeline extends QScript { trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; memoryLimit = 4; - phone_home = if ( LOCAL_ET ) GATKRunReport.PhoneHomeOption.STANDARD else GATKRunReport.PhoneHomeOption.AWS_S3 + phone_home = GATKRunReport.PhoneHomeOption.NO_ET } def bai(bam: File) = new File(bam + ".bai") From fc08235ff374e3eba8d326c67e9698f9c9280523 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 27 Jan 2012 15:12:37 -0500 Subject: [PATCH 15/22] Bug fix in active region traversal, locusView.getNext() skips over pileups with zero coverage but still need to count them in the active probability integrator --- .../traversals/TraverseActiveRegions.java | 43 +++++++++++-------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 83daa4d80..562a6d1d0 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -10,7 +10,6 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -63,25 +62,26 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends TraversalEngine extends TraversalEngine maxVal ) { maxVal = activeProbArray[jjj]; } } filteredProbArray[iii] = maxVal; @@ -241,14 +242,18 @@ public class TraverseActiveRegions extends TraversalEngine ACTIVE_PROB_THRESHOLD; if( curStatus != thisStatus || (iii-curStart) > MAX_ACTIVE_REGION ) { - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)), + returnList.add( new ActiveRegion( + engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (iii-1)), curStatus, engine.getGenomeLocParser(), activeRegionExtension ) ); curStatus = thisStatus; curStart = iii; } } - returnList.add( new ActiveRegion( engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)), - curStatus, engine.getGenomeLocParser(), activeRegionExtension ) ); + if( curStart != filteredProbArray.length-1 ) { + returnList.add( new ActiveRegion( + engine.getGenomeLocParser().createGenomeLoc(firstIsActiveStart.getContig(), firstIsActiveStart.getStart() + curStart, firstIsActiveStart.getStart() + (filteredProbArray.length-1)), + curStatus, engine.getGenomeLocParser(), activeRegionExtension ) ); + } return returnList; } } From a9671b73ca0fc50d7c2722727e90d731d38037aa Mon Sep 17 00:00:00 2001 From: Menachem Fromer Date: Fri, 27 Jan 2012 16:01:30 -0500 Subject: [PATCH 16/22] Fix to permit proper handling of mapping qualities between 128 to 255 (which get converted to byte values of -128 to -1) --- .../org/broadinstitute/sting/utils/QualityUtils.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 19e03a19d..7ec6a74d7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -17,7 +17,7 @@ public class QualityUtils { private static double qualToErrorProbCache[] = new double[256]; static { - for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw((byte)i); + for (int i = 0; i < 256; i++) qualToErrorProbCache[i] = qualToErrorProbRaw(i); } /** @@ -44,15 +44,15 @@ public class QualityUtils { * Convert a quality score to a probability of error. This is the Phred-style * conversion, *not* the Illumina-style conversion (though asymptotically, they're the same). * - * @param qual a quality score (0-40) - * @return a probability (0.0-1.0) + * @param qual a quality score (0 - 255) + * @return a probability (0.0 - 1.0) */ - static public double qualToErrorProbRaw(byte qual) { + static private double qualToErrorProbRaw(int qual) { return Math.pow(10.0, ((double) qual)/-10.0); } static public double qualToErrorProb(byte qual) { - return qualToErrorProbCache[qual]; + return qualToErrorProbCache[(int)qual & 0xff]; // Map: 127 -> 127; -128 -> 128; -1 -> 255; etc. } /** From 3164c8dee57cb84a3c60c38f67e196a7fc25038e Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 29 Jan 2012 15:14:58 -0500 Subject: [PATCH 17/22] S3 upload now directly creates the XML report in memory and puts that in S3 -- This is a partial fix for the problem with uploading S3 logs reported by Mauricio. There the problem is that the java.io.tmpdir is not accessible (network just hangs). Because of that the s3 upload fails because the underlying system uses tmpdir for caching, etc. As far as I can tell there's no way around this bug -- you cannot overload the java.io.tmpdir programmatically and even if I could what value would we use? The only solution seems to me is to detect that tmpdir is hanging (how?!) and fail with a meaningful error. --- ivy.xml | 2 +- .../sting/gatk/phonehome/GATKRunReport.java | 103 +++++++++--------- 2 files changed, 50 insertions(+), 55 deletions(-) diff --git a/ivy.xml b/ivy.xml index f7c64aec6..06296c6b4 100644 --- a/ivy.xml +++ b/ivy.xml @@ -72,7 +72,7 @@ - + diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index f09865537..e8627ef4c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -264,22 +264,8 @@ public class GATKRunReport { } } - /** - * Opens the destination file and writes a gzipped version of the XML report there. - * - * @param destination - * @throws IOException - */ - private void postReportToFile(File destination) throws IOException { - BufferedOutputStream out = - new BufferedOutputStream( - new GZIPOutputStream( - new FileOutputStream(destination))); - try { - postReportToStream(out); - } finally { - out.close(); - } + private final String getKey() { + return getID() + ".report.xml.gz"; } /** @@ -288,16 +274,21 @@ public class GATKRunReport { * That is, postReport() is guarenteed not to fail for any reason. */ private File postReportToLocalDisk(File rootDir) { - String filename = getID() + ".report.xml.gz"; - File file = new File(rootDir, filename); + final String filename = getKey(); + final File destination = new File(rootDir, filename); + try { - postReportToFile(file); - logger.debug("Wrote report to " + file); - return file; + final BufferedOutputStream out = new BufferedOutputStream( + new GZIPOutputStream( + new FileOutputStream(destination))); + postReportToStream(out); + out.close(); + logger.debug("Wrote report to " + destination); + return destination; } catch ( Exception e ) { // we catch everything, and no matter what eat the error exceptDuringRunReport("Couldn't read report file", e); - file.delete(); + destination.delete(); return null; } } @@ -305,42 +296,46 @@ public class GATKRunReport { private void postReportToAWSS3() { // modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html this.hostName = Utils.resolveHostname(); // we want to fill in the host name - File localFile = postReportToLocalDisk(new File("./")); - logger.debug("Generating GATK report to AWS S3 based on local file " + localFile); - if ( localFile != null ) { // we succeeded in creating the local file - localFile.deleteOnExit(); - try { - // stop us from printing the annoying, and meaningless, mime types warning - Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class); - mimeTypeLogger.setLevel(Level.FATAL); + final String key = getKey(); + logger.debug("Generating GATK report to AWS S3 with key " + key); + try { + // create an byte output stream so we can capture the output as a byte[] + final ByteArrayOutputStream byteStream = new ByteArrayOutputStream(8096); + final OutputStream outputStream = new GZIPOutputStream(byteStream); + postReportToStream(outputStream); + outputStream.close(); + final byte[] report = byteStream.toByteArray(); - // Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials - // are stored in an AWSCredentials object: + // stop us from printing the annoying, and meaningless, mime types warning + Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class); + mimeTypeLogger.setLevel(Level.FATAL); - // IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket - String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user - String awsSecretKey = "uQLTduhK6Gy8mbOycpoZIxr8ZoVj1SQaglTWjpYA"; // GATK AWS user - AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey); + // Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials + // are stored in an AWSCredentials object: - // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP - // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t. - S3Service s3Service = new RestS3Service(awsCredentials); + // IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket + String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user + String awsSecretKey = "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"; // GATK AWS user + AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey); - // Create an S3Object based on a file, with Content-Length set automatically and - // Content-Type set based on the file's extension (using the Mimetypes utility class) - S3Object fileObject = new S3Object(localFile); - //logger.info("Created S3Object" + fileObject); - //logger.info("Uploading " + localFile + " to AWS bucket"); - S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject); - logger.debug("Uploaded to AWS: " + s3Object); - logger.info("Uploaded run statistics report to AWS S3"); - } catch ( S3ServiceException e ) { - exceptDuringRunReport("S3 exception occurred", e); - } catch ( NoSuchAlgorithmException e ) { - exceptDuringRunReport("Couldn't calculate MD5", e); - } catch ( IOException e ) { - exceptDuringRunReport("Couldn't read report file", e); - } + // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP + // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t. + S3Service s3Service = new RestS3Service(awsCredentials); + + // Create an S3Object based on a file, with Content-Length set automatically and + // Content-Type set based on the file's extension (using the Mimetypes utility class) + S3Object fileObject = new S3Object(key, report); + //logger.info("Created S3Object" + fileObject); + //logger.info("Uploading " + localFile + " to AWS bucket"); + S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject); + logger.debug("Uploaded to AWS: " + s3Object); + logger.info("Uploaded run statistics report to AWS S3"); + } catch ( S3ServiceException e ) { + exceptDuringRunReport("S3 exception occurred", e); + } catch ( NoSuchAlgorithmException e ) { + exceptDuringRunReport("Couldn't calculate MD5", e); + } catch ( IOException e ) { + exceptDuringRunReport("Couldn't read report file", e); } } From d5d4fa8a88ee599ea716ccdcecc0e6892b93e6eb Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 30 Jan 2012 09:50:14 -0500 Subject: [PATCH 20/22] Fixed discordance bug reported by Brad Chapman discordance now reports discordance between genotypes as well (just like concordance) --- .../walkers/variantutils/SelectVariants.java | 20 +++++++++---------- .../SelectVariantsIntegrationTest.java | 2 +- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index 6d94ffe6d..5eef7fb66 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -25,22 +25,20 @@ package org.broadinstitute.sting.gatk.walkers.variantutils; import org.broadinstitute.sting.commandline.*; -import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; -import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.gatk.walkers.TreeReducible; -import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.utils.MendelianViolation; -import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.sting.utils.variantcontext.*; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.text.XReadLines; +import org.broadinstitute.sting.utils.variantcontext.*; import java.io.File; import java.io.FileNotFoundException; @@ -557,7 +555,7 @@ public class SelectVariants extends RodWalker implements TreeR // Look for this sample in the all vcs of the comp ROD track. boolean foundVariant = false; for (VariantContext compVC : compVCs) { - if (sampleHasVariant(compVC.getGenotype(g.getSampleName()))) { + if (haveSameGenotypes(g, compVC.getGenotype(g.getSampleName()))) { foundVariant = true; break; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index 042de2a27..9577966b7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -58,7 +58,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + hg19Reference + " -sn NA12878 -L 20:1012700-1020000 --variant " + b37hapmapGenotypes + " -disc " + testFile + " -o %s -NO_HEADER", 1, - Arrays.asList("78e6842325f1f1bc9ab30d5e7737ee6e") + Arrays.asList("929bbb96381541c162dc7e5462e26ea2") ); executeTest("testDiscordance--" + testFile, spec); From 17dbe9a95dd1f638d050a4dc9cf41227fdf562c1 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Mon, 30 Jan 2012 21:37:02 -0500 Subject: [PATCH 22/22] A few cleanups in the LocusIteratorByState * No more N's in the extended event pileups * Only add to the pileup MQ0 counter if the read actually goes into the pileup --- .../gatk/iterators/LocusIteratorByState.java | 71 ++++++++++--------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java index 2257cc139..34ac17f49 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java +++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java @@ -74,16 +74,16 @@ public class LocusIteratorByState extends LocusIterator { static private class SAMRecordState { SAMRecord read; - int readOffset = -1; // how far are we offset from the start of the read bases? - int genomeOffset = -1; // how far are we offset from the alignment start on the genome? + int readOffset = -1; // how far are we offset from the start of the read bases? + int genomeOffset = -1; // how far are we offset from the alignment start on the genome? Cigar cigar = null; int cigarOffset = -1; CigarElement curElement = null; int nCigarElements = 0; - // how far are we into a single cigarElement - int cigarElementCounter = -1; + + int cigarElementCounter = -1; // how far are we into a single cigarElement // The logical model for generating extended events is as follows: the "record state" implements the traversal // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This @@ -93,19 +93,19 @@ public class LocusIteratorByState extends LocusIterator { // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended // events immediately preceding the current reference base). - boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases? - // the only purpose of this flag is to shield away a few additional lines of code - // when extended piles are not needed, it may not be even worth it... + boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases? + // the only purpose of this flag is to shield away a few additional lines of code + // when extended piles are not needed, it may not be even worth it... - byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels) - int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events - byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the - // current base on the ref. We use a counter-like variable here since clearing the indel event is - // delayed by one base, so we need to remember how long ago we have seen the actual event + byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels) + int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events + byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the + // current base on the ref. We use a counter-like variable here since clearing the indel event is + // delayed by one base, so we need to remember how long ago we have seen the actual event - int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the - // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly, - // we cache it here mainly for convenience + int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the + // event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly, + // we cache it here mainly for convenience public SAMRecordState(SAMRecord read, boolean extended) { @@ -241,6 +241,8 @@ public class LocusIteratorByState extends LocusIterator { readOffset += curElement.getLength(); break; case D: // deletion w.r.t. the reference + if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string + throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString()); if (generateExtendedEvents) { if (cigarElementCounter == 1) { // generate an extended event only if we just stepped into the deletion (i.e. don't @@ -403,9 +405,9 @@ public class LocusIteratorByState extends LocusIterator { final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began. final int eventLength = state.getEventLength(); -// if (op != CigarOperator.N) // N's are never added to any pileup -// continue; -// + if (op == CigarOperator.N) // N's are never added to any pileup + continue; + if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref size++; ExtendedEventPileupElement pileupElement; @@ -413,27 +415,26 @@ public class LocusIteratorByState extends LocusIterator { nDeletions++; maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength()); pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength); - } + } else { // Insertion event nInsertions++; pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases()); } + if (read.getMappingQuality() == 0) + nMQ0Reads++; indelPile.add(pileupElement); } - // this read has no indel associated with the previous position on the ref. Criteria to include in the pileup are: - // we only add reads that are not N's - // we only include deletions to the pileup if the walker requests it - else if ( (op != CigarOperator.N) && (op != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci())) { + // this read has no indel so add it to the pileup as a NOEVENT: + // a deletion that didn't start here (therefore, not an extended event) + // we add (mis)matches as no events. + else if (op != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci()) { size++; indelPile.add(new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), readOffset)); + if (read.getMappingQuality() == 0) + nMQ0Reads++; } - - - if (state.getRead().getMappingQuality() == 0) - nMQ0Reads++; - } if (indelPile.size() != 0) @@ -461,25 +462,25 @@ public class LocusIteratorByState extends LocusIterator { final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator final CigarOperator nextOp = state.peekForwardOnGenome(); // next cigar operator final int readOffset = state.getReadOffset(); // the base offset on this read - final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began. if (op == CigarOperator.N) // N's are never added to any pileup continue; - if (read.getMappingQuality() == 0) - nMQ0Reads++; - if (op == CigarOperator.D) { if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so - int leftAlignedStart = (eventStartOffset < 0) ? readOffset : eventStartOffset; - pile.add(new PileupElement(read, leftAlignedStart, true, nextOp == CigarOperator.I, false)); + pile.add(new PileupElement(read, readOffset, true, nextOp == CigarOperator.I, false)); size++; nDeletions++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; } - } else { + } + else { if (!filterBaseInRead(read, location.getStart())) { pile.add(new PileupElement(read, readOffset, false, nextOp == CigarOperator.I, op == CigarOperator.S)); size++; + if (read.getMappingQuality() == 0) + nMQ0Reads++; } } }