From f6d5499582e707e9a86f7c654e340f1c474485c1 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Tue, 18 Dec 2012 15:46:34 -0500 Subject: [PATCH 01/22] The GATK engine now ensures that incoming GATKSAMRecords have GATKSAMReadGroupRecord objects in their header -- Update SAMDataSource so that the merged header contains GATKSAMReadGroupRecord -- Now getting the NGSPlatform for a GATKSAMRecord is actually efficient, instead of computing the NGS platform over and over from the PL string -- Updated a few places in the code where the input argument is actually a GATKSAMRecord, not a SAMRecord for type safety --- .../gatk/datasources/reads/SAMDataSource.java | 10 +++ .../sting/gatk/filters/Platform454Filter.java | 3 +- .../sting/gatk/filters/PlatformFilter.java | 3 +- .../indels/HaplotypeIndelErrorModel.java | 3 +- .../gatk/walkers/indels/IndelRealigner.java | 4 +- .../sting/utils/NGSPlatform.java | 29 ++---- .../utils/sam/GATKSAMReadGroupRecord.java | 90 +++++++++---------- .../sting/utils/sam/GATKSAMRecord.java | 24 ++--- .../sting/utils/sam/ReadUtils.java | 10 +-- 9 files changed, 88 insertions(+), 88 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index e99814278..e3f197716 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -44,6 +44,7 @@ import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.baq.ReadTransformingIterator; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import java.io.File; @@ -894,9 +895,11 @@ public class SAMDataSource { long lastTick = timer.currentTime(); for(final SAMReaderID readerID: readerIDs) { final ReaderInitializer init = new ReaderInitializer(readerID).call(); + if (removeProgramRecords) { init.reader.getFileHeader().setProgramRecords(new ArrayList()); } + if (threadAllocation.getNumIOThreads() > 0) { inputStreams.put(init.readerID, init.blockInputStream); // get from initializer } @@ -916,6 +919,13 @@ public class SAMDataSource { for(SAMFileReader reader: readers.values()) headers.add(reader.getFileHeader()); headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate,headers,true); + + // update all read groups to GATKSAMRecordReadGroups + final List gatkReadGroups = new LinkedList(); + for ( final SAMReadGroupRecord rg : headerMerger.getMergedHeader().getReadGroups() ) { + gatkReadGroups.add(new GATKSAMReadGroupRecord(rg)); + } + headerMerger.getMergedHeader().setReadGroups(gatkReadGroups); } final private void printReaderPerformance(final int nExecutedTotal, diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java index 8ad91ac1c..a1f2a877b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/Platform454Filter.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.filters; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; /** @@ -37,6 +38,6 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; public class Platform454Filter extends ReadFilter { public boolean filterOut(SAMRecord rec) { - return (ReadUtils.is454Read(rec)); + return (ReadUtils.is454Read((GATKSAMRecord)rec)); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java index 8e241bb2c..de5be94bc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/PlatformFilter.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.filters; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; /** @@ -41,7 +42,7 @@ public class PlatformFilter extends ReadFilter { public boolean filterOut(SAMRecord rec) { for ( String name : PLFilterNames ) - if ( ReadUtils.isPlatformRead(rec, name.toUpperCase() )) + if ( ReadUtils.isPlatformRead((GATKSAMRecord)rec, name.toUpperCase() )) return true; return false; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index 795fc76ed..3c1bc338a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.variant.variantcontext.Allele; @@ -421,7 +422,7 @@ public class HaplotypeIndelErrorModel { double[][] haplotypeLikehoodMatrix = new double[haplotypesInVC.size()][haplotypesInVC.size()]; double readLikelihoods[][] = new double[pileup.getReads().size()][haplotypesInVC.size()]; int i=0; - for (SAMRecord read : pileup.getReads()) { + for (GATKSAMRecord read : pileup.getReads()) { if(ReadUtils.is454Read(read)) { continue; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 5d19ac5e8..15d3f43fd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -529,7 +529,7 @@ public class IndelRealigner extends ReadWalker { sawReadInCurrentInterval = false; } - private boolean doNotTryToClean(SAMRecord read) { + private boolean doNotTryToClean(GATKSAMRecord read) { return read.getReadUnmappedFlag() || read.getNotPrimaryAlignmentFlag() || read.getReadFailsVendorQualityCheckFlag() || @@ -835,7 +835,7 @@ public class IndelRealigner extends ReadWalker { // TODO -- get rid of this try block when Picard does the right thing for reads aligned off the end of the reference try { if ( read.getAttribute(SAMTag.NM.name()) != null ) - read.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(read, reference, leftmostIndex-1)); + read.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(read, reference, leftmostIndex - 1)); if ( read.getAttribute(SAMTag.UQ.name()) != null ) read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, reference, leftmostIndex-1)); } catch (Exception e) { diff --git a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java index 504704e55..847d8067c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java +++ b/public/java/src/org/broadinstitute/sting/utils/NGSPlatform.java @@ -24,8 +24,7 @@ package org.broadinstitute.sting.utils; -import net.sf.samtools.SAMReadGroupRecord; -import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** * A canonical, master list of the standard NGS platforms. These values @@ -64,25 +63,15 @@ public enum NGSPlatform { } /** - * Convenience constructor -- calculates the NGSPlatfrom from a SAMRecord. - * Note you should not use this function if you have a GATKSAMRecord -- use the - * accessor method instead. + * Convenience get -- get the NGSPlatfrom from a SAMRecord. * - * @param read + * Just gets the platform from the GATKReadGroupRecord associated with this read. + * + * @param read a GATKSAMRecord * @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match */ - public static final NGSPlatform fromRead(SAMRecord read) { - return fromReadGroup(read.getReadGroup()); - } - - /** - * Returns the NGSPlatform corresponding to the PL tag in the read group - * @param rg - * @return an NGSPlatform object matching the PL field of the header, of UNKNOWN if there was no match - */ - public static final NGSPlatform fromReadGroup(SAMReadGroupRecord rg) { - if ( rg == null ) return UNKNOWN; - return fromReadGroupPL(rg.getPlatform()); + public static NGSPlatform fromRead(GATKSAMRecord read) { + return read.getReadGroup().getNGSPlatform(); } /** @@ -90,7 +79,7 @@ public enum NGSPlatform { * @param plFromRG -- the PL field (or equivalent) in a ReadGroup object * @return an NGSPlatform object matching the PL field of the header, or UNKNOWN if there was no match */ - public static final NGSPlatform fromReadGroupPL(final String plFromRG) { + public static NGSPlatform fromReadGroupPL(final String plFromRG) { if ( plFromRG == null ) return UNKNOWN; // todo -- algorithm could be implemented more efficiently, as the list of all @@ -113,7 +102,7 @@ public enum NGSPlatform { * @param platform the read group string that describes the platform used * @return true if the platform is known (i.e. it's in the list and is not UNKNOWN) */ - public static final boolean isKnown (final String platform) { + public static final boolean isKnown(final String platform) { return fromReadGroupPL(platform) != UNKNOWN; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java index bb99156fe..5f70ced92 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMReadGroupRecord.java @@ -12,9 +12,6 @@ import org.broadinstitute.sting.utils.NGSPlatform; * */ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { - - public static final String LANE_TAG = "LN"; - // the SAMReadGroupRecord data we're caching private String mSample = null; private String mPlatform = null; @@ -33,46 +30,14 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { super(record.getReadGroupId(), record); } - public GATKSAMReadGroupRecord(SAMReadGroupRecord record, NGSPlatform pl) { - super(record.getReadGroupId(), record); - setPlatform(pl.getDefaultPlatform()); - mNGSPlatform = pl; - retrievedPlatform = retrievedNGSPlatform = true; - } - - /////////////////////////////////////////////////////////////////////////////// - // *** The following methods are overloaded to cache the appropriate data ***// - /////////////////////////////////////////////////////////////////////////////// - - public String getSample() { - if ( !retrievedSample ) { - mSample = super.getSample(); - retrievedSample = true; - } - return mSample; - } - - public void setSample(String s) { - super.setSample(s); - mSample = s; - retrievedSample = true; - } - - public String getPlatform() { - if ( !retrievedPlatform ) { - mPlatform = super.getPlatform(); - retrievedPlatform = true; - } - return mPlatform; - } - - public void setPlatform(String s) { - super.setPlatform(s); - mPlatform = s; - retrievedPlatform = true; - retrievedNGSPlatform = false; // recalculate the NGSPlatform - } - + /** + * Get the NGSPlatform enum telling us the platform of this read group + * + * This function call is caching, so subsequent calls to it are free, while + * the first time it's called there's a bit of work to resolve the enum + * + * @return an NGSPlatform enum value + */ public NGSPlatform getNGSPlatform() { if ( ! retrievedNGSPlatform ) { mNGSPlatform = NGSPlatform.fromReadGroupPL(getPlatform()); @@ -82,11 +47,40 @@ public class GATKSAMReadGroupRecord extends SAMReadGroupRecord { return mNGSPlatform; } - public String getLane() { - return this.getAttribute(LANE_TAG); + /////////////////////////////////////////////////////////////////////////////// + // *** The following methods are overloaded to cache the appropriate data ***// + /////////////////////////////////////////////////////////////////////////////// + + @Override + public String getSample() { + if ( !retrievedSample ) { + mSample = super.getSample(); + retrievedSample = true; + } + return mSample; } - - public void setLane(String lane) { - this.setAttribute(LANE_TAG, lane); + + @Override + public void setSample(String s) { + super.setSample(s); + mSample = s; + retrievedSample = true; + } + + @Override + public String getPlatform() { + if ( !retrievedPlatform ) { + mPlatform = super.getPlatform(); + retrievedPlatform = true; + } + return mPlatform; + } + + @Override + public void setPlatform(String s) { + super.setPlatform(s); + mPlatform = s; + retrievedPlatform = true; + retrievedNGSPlatform = false; // recalculate the NGSPlatform } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index 9fdb48b34..a83aca8f3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -25,9 +25,9 @@ package org.broadinstitute.sting.utils.sam; import net.sf.samtools.*; -import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.recalibration.EventType; import java.util.Arrays; import java.util.HashMap; @@ -141,16 +141,26 @@ public class GATKSAMRecord extends BAMRecord { mReadString = s; } + /** + * Get the GATKSAMReadGroupRecord of this read + * @return a non-null GATKSAMReadGroupRecord + */ @Override public GATKSAMReadGroupRecord getReadGroup() { - if ( !retrievedReadGroup ) { - SAMReadGroupRecord tempReadGroup = super.getReadGroup(); - mReadGroup = (tempReadGroup == null ? null : new GATKSAMReadGroupRecord(tempReadGroup)); + if ( ! retrievedReadGroup ) { + mReadGroup = (GATKSAMReadGroupRecord)super.getReadGroup(); retrievedReadGroup = true; } return mReadGroup; } + public void setReadGroup( final GATKSAMReadGroupRecord readGroup ) { + mReadGroup = readGroup; + retrievedReadGroup = true; + setAttribute("RG", mReadGroup.getId()); // todo -- this should be standardized, but we don't have access to SAMTagUtils! + } + + @Override public int hashCode() { return super.hashCode(); @@ -259,12 +269,6 @@ public class GATKSAMRecord extends BAMRecord { return getReadGroup().getNGSPlatform(); } - public void setReadGroup( final GATKSAMReadGroupRecord readGroup ) { - mReadGroup = readGroup; - retrievedReadGroup = true; - setAttribute("RG", mReadGroup.getId()); // todo -- this should be standardized, but we don't have access to SAMTagUtils! - } - /////////////////////////////////////////////////////////////////////////////// // *** ReduceReads functions ***// /////////////////////////////////////////////////////////////////////////////// diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index bd908727f..263cd9bd1 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -226,7 +226,7 @@ public class ReadUtils { * @param read the read to test * @return checks the read group tag PL for the default 454 tag */ - public static boolean is454Read(SAMRecord read) { + public static boolean is454Read(GATKSAMRecord read) { return NGSPlatform.fromRead(read) == NGSPlatform.LS454; } @@ -236,7 +236,7 @@ public class ReadUtils { * @param read the read to test * @return checks the read group tag PL for the default ion tag */ - public static boolean isIonRead(SAMRecord read) { + public static boolean isIonRead(GATKSAMRecord read) { return NGSPlatform.fromRead(read) == NGSPlatform.ION_TORRENT; } @@ -246,7 +246,7 @@ public class ReadUtils { * @param read the read to test * @return checks the read group tag PL for the default SOLiD tag */ - public static boolean isSOLiDRead(SAMRecord read) { + public static boolean isSOLiDRead(GATKSAMRecord read) { return NGSPlatform.fromRead(read) == NGSPlatform.SOLID; } @@ -256,7 +256,7 @@ public class ReadUtils { * @param read the read to test * @return checks the read group tag PL for the default SLX tag */ - public static boolean isIlluminaRead(SAMRecord read) { + public static boolean isIlluminaRead(GATKSAMRecord read) { return NGSPlatform.fromRead(read) == NGSPlatform.ILLUMINA; } @@ -268,7 +268,7 @@ public class ReadUtils { * @param name the upper-cased platform name to test * @return whether or not name == PL tag in the read group of read */ - public static boolean isPlatformRead(SAMRecord read, String name) { + public static boolean isPlatformRead(GATKSAMRecord read, String name) { SAMReadGroupRecord readGroup = read.getReadGroup(); if (readGroup != null) { From 0f0188ddb1704d5cdb7849602df0fad113c31a74 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 19 Dec 2012 08:15:42 -0500 Subject: [PATCH 02/22] Optimization of BQSR -- Created a ReadRecalibrationInfo class that holds all of the information (read, base quality vectors, error vectors) for a read for the call to updateDataForRead in RecalibrationEngine. This object has a restrictive interface to just get information about specific qual and error values at offset and for event type. This restrict allows us to avoid creating an vector of byte 45 for each read to represent BI and BD values not in the reads. Shaves 5% of the runtime off the entire code. -- Cleaned up code and added lots more docs -- With this commit we no longer have much in the way of low-hanging fruit left in the optimization of BQSR. 95% of the runtime is spent in BAQing the read, and updating the RecalData in the NestedIntegerArrays. --- .../bqsr/AdvancedRecalibrationEngine.java | 31 +--- .../gatk/walkers/bqsr/BaseRecalibrator.java | 15 +- .../walkers/bqsr/ReadRecalibrationInfo.java | 157 ++++++++++++++++++ .../walkers/bqsr/RecalibrationEngine.java | 26 ++- .../bqsr/StandardRecalibrationEngine.java | 42 +++-- .../sting/utils/sam/GATKSAMRecord.java | 10 +- 6 files changed, 223 insertions(+), 58 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index ac280b70e..b56e5cc16 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -28,41 +28,22 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.threading.ThreadLocalArray; public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { - - // optimization: only allocate temp arrays once per thread - private final ThreadLocal threadLocalTempQualArray = new ThreadLocalArray(EventType.values().length, byte.class); - private final ThreadLocal threadLocalTempFractionalErrorArray = new ThreadLocalArray(EventType.values().length, double.class); - - public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { - super.initialize(covariates, recalibrationTables); - } - @Override - public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { - final ReadCovariates readCovariates = covariateKeySetFrom(read); - byte[] tempQualArray = threadLocalTempQualArray.get(); - double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get(); + public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { + final GATKSAMRecord read = recalInfo.getRead(); + final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); for( int offset = 0; offset < read.getReadBases().length; offset++ ) { - if( !skip[offset] ) { - tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset]; - tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset]; - tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset]; - tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset]; - tempQualArray[EventType.BASE_DELETION.index] = read.getBaseDeletionQualities()[offset]; - tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset]; + if( ! recalInfo.skip(offset) ) { for (final EventType eventType : EventType.values()) { final int[] keys = readCovariates.getKeySet(offset, eventType); final int eventIndex = eventType.index; - final byte qual = tempQualArray[eventIndex]; - final double isError = tempFractionalErrorArray[eventIndex]; + final byte qual = recalInfo.getQual(eventType, offset); + final double isError = recalInfo.getErrorFraction(eventType, offset); incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 6931235b8..8a71872c1 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -225,19 +225,22 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls return 0L; // skip this read completely } - read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates)); - final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases - final int[] isSNP = calculateIsSNP(read, ref, originalRead); - final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION); - final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION); final byte[] baqArray = calculateBAQArray(read); if( baqArray != null ) { // some reads just can't be BAQ'ed + final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates); + final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases + final int[] isSNP = calculateIsSNP(read, ref, originalRead); + final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION); + final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION); final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray); final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray); final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray); - recalibrationEngine.updateDataForRead(read, skip, snpErrors, insertionErrors, deletionErrors); + + // aggregrate all of the info into our info object, and update the data + final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors); + recalibrationEngine.updateDataForRead(info); return 1L; } else { return 0L; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java new file mode 100644 index 000000000..f8d3c4281 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 12/18/12 + * Time: 3:50 PM + * + * TODO -- merge in ReadCovariates? + */ +public final class ReadRecalibrationInfo { + private final GATKSAMRecord read; + private final int length; + private final ReadCovariates covariates; + private final boolean[] skips; + private final byte[] baseQuals, insertionQuals, deletionQuals; + private final double[] snpErrors, insertionErrors, deletionErrors; + + public ReadRecalibrationInfo(final GATKSAMRecord read, + final ReadCovariates covariates, + final boolean[] skips, + final double[] snpErrors, + final double[] insertionErrors, + final double[] deletionErrors) { + if ( read == null ) throw new IllegalArgumentException("read cannot be null"); + if ( covariates == null ) throw new IllegalArgumentException("covariates cannot be null"); + if ( skips == null ) throw new IllegalArgumentException("skips cannot be null"); + if ( snpErrors == null ) throw new IllegalArgumentException("snpErrors cannot be null"); + // future: may allow insertionErrors && deletionErrors to be null, so don't enforce + + this.read = read; + this.baseQuals = read.getBaseQualities(); + this.length = baseQuals.length; + this.covariates = covariates; + this.skips = skips; + this.insertionQuals = read.getExistingBaseInsertionQualities(); + this.deletionQuals = read.getExistingBaseDeletionQualities(); + this.snpErrors = snpErrors; + this.insertionErrors = insertionErrors; + this.deletionErrors = deletionErrors; + + if ( skips.length != length ) throw new IllegalArgumentException("skips.length " + snpErrors.length + " != length " + length); + if ( snpErrors.length != length ) throw new IllegalArgumentException("snpErrors.length " + snpErrors.length + " != length " + length); + if ( insertionErrors != null && insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + snpErrors.length + " != length " + length); + if ( deletionErrors != null && deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + snpErrors.length + " != length " + length); + } + + /** + * Get the qual score for event type at offset + * + * @param eventType the type of event we want the qual for + * @param offset the offset into this read for the qual + * @return a valid quality score for event at offset + */ + @Requires("validOffset(offset)") + @Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE") + public byte getQual(final EventType eventType, final int offset) { + switch ( eventType ) { + case BASE_SUBSTITUTION: return baseQuals[offset]; + // note optimization here -- if we don't have ins/del quals we just return the default byte directly + case BASE_INSERTION: return insertionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : insertionQuals[offset]; + case BASE_DELETION: return deletionQuals == null ? GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL : deletionQuals[offset]; + default: throw new IllegalStateException("Unknown event type " + eventType); + } + } + + /** + * Get the error fraction for event type at offset + * + * The error fraction is a value between 0 and 1 that indicates how much certainty we have + * in the error occurring at offset. A value of 1 means that the error definitely occurs at this + * site, a value of 0.0 means it definitely doesn't happen here. 0.5 means that half the weight + * of the error belongs here + * + * @param eventType the type of event we want the qual for + * @param offset the offset into this read for the qual + * @return a fractional weight for an error at this offset + */ + @Requires("validOffset(offset)") + @Ensures("result >= 0.0 && result <= 1.0") + public double getErrorFraction(final EventType eventType, final int offset) { + switch ( eventType ) { + case BASE_SUBSTITUTION: return snpErrors[offset]; + case BASE_INSERTION: return insertionErrors[offset]; + case BASE_DELETION: return deletionErrors[offset]; + default: throw new IllegalStateException("Unknown event type " + eventType); + } + } + + /** + * Get the read involved in this recalibration info + * @return a non-null GATKSAMRecord + */ + @Ensures("result != null") + public GATKSAMRecord getRead() { + return read; + } + + /** + * Should offset in this read be skipped (because it's covered by a known variation site?) + * @param offset a valid offset into this info + * @return true if offset should be skipped, false otherwise + */ + @Requires("validOffset(offset)") + public boolean skip(final int offset) { + return skips[offset]; + } + + /** + * Get the ReadCovariates object carrying the mapping from offsets -> covariate key sets + * @return a non-null ReadCovariates object + */ + @Ensures("result != null") + public ReadCovariates getCovariatesValues() { + return covariates; + } + + /** + * Ensures an offset is valid. Used in contracts + * @param offset a proposed offset + * @return true if offset is valid w.r.t. the data in this object, false otherwise + */ + private boolean validOffset(final int offset) { + return offset >= 0 && offset < baseQuals.length; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index 35375eb1d..5c002b7e5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; +import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -29,10 +30,31 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; * OTHER DEALINGS IN THE SOFTWARE. */ public interface RecalibrationEngine { - + /** + * Initialize the recalibration engine + * + * Called once before any calls to updateDataForRead are made. The engine should prepare itself + * to handle any number of updateDataForRead calls containing ReadRecalibrationInfo containing + * keys for each of the covariates provided. + * + * The engine should collect match and mismatch data into the recalibrationTables data. + * + * @param covariates an array of the covariates we'll be using in this engine, order matters + * @param recalibrationTables the destination recalibrationTables where stats should be collected + */ public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables); - public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors); + /** + * Update the recalibration statistics using the information in recalInfo + * @param recalInfo data structure holding information about the recalibration values for a single read + */ + @Requires("recalInfo != null") + public void updateDataForRead(final ReadRecalibrationInfo recalInfo); + /** + * Finalize, if appropriate, all derived data in recalibrationTables. + * + * Called once after all calls to updateDataForRead have been issued. + */ public void finalizeData(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index 1e166dfd0..a6ab98e8b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -35,34 +35,37 @@ import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; public class StandardRecalibrationEngine implements RecalibrationEngine, PublicPackageSource { - protected Covariate[] covariates; protected RecalibrationTables recalibrationTables; + @Override public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { + if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null"); + if ( recalibrationTables == null ) throw new IllegalArgumentException("recalibrationTables cannot be null"); + this.covariates = covariates.clone(); this.recalibrationTables = recalibrationTables; } @Override - public void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { + final GATKSAMRecord read = recalInfo.getRead(); + final EventType eventType = EventType.BASE_SUBSTITUTION; + final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); + for( int offset = 0; offset < read.getReadBases().length; offset++ ) { - if( !skip[offset] ) { - final ReadCovariates readCovariates = covariateKeySetFrom(read); + if( ! recalInfo.skip(offset) ) { + final byte qual = recalInfo.getQual(eventType, offset); + final double isError = recalInfo.getErrorFraction(eventType, offset); + final int[] keys = readCovariates.getKeySet(offset, eventType); - final byte qual = read.getBaseQualities()[offset]; - final double isError = snpErrors[offset]; - - final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION); - final int eventIndex = EventType.BASE_SUBSTITUTION.index; - - incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); + incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventType.index); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) continue; - incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventType.index); } } } @@ -79,16 +82,6 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP return new RecalDatum(1, isError, reportedQual); } - /** - * Get the covariate key set from a read - * - * @param read the read - * @return the covariate keysets for this read - */ - protected ReadCovariates covariateKeySetFrom(GATKSAMRecord read) { - return (ReadCovariates) read.getTemporaryAttribute(BaseRecalibrator.COVARS_ATTRIBUTE); - } - /** * Create derived recalibration data tables * @@ -129,7 +122,10 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP * @param isError error value for this event * @param keys location in table of our item */ - protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final double isError, final int... keys ) { + protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, + final byte qual, + final double isError, + final int... keys ) { final RecalDatum existingDatum = table.get(keys); if ( existingDatum == null ) { diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index a83aca8f3..ebb3c1ad0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -56,6 +56,12 @@ public class GATKSAMRecord extends BAMRecord { public static final String BQSR_BASE_INSERTION_QUALITIES = "BI"; // base qualities for insertions public static final String BQSR_BASE_DELETION_QUALITIES = "BD"; // base qualities for deletions + /** + * The default quality score for an insertion or deletion, if + * none are provided for this read. + */ + public static final byte DEFAULT_INSERTION_DELETION_QUAL = (byte)45; + // the SAMRecord data we're caching private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; @@ -239,7 +245,7 @@ public class GATKSAMRecord extends BAMRecord { byte [] quals = getExistingBaseInsertionQualities(); if( quals == null ) { quals = new byte[getBaseQualities().length]; - Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will + Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL); // Some day in the future when base insertion and base deletion quals exist the samtools API will // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 } return quals; @@ -255,7 +261,7 @@ public class GATKSAMRecord extends BAMRecord { byte[] quals = getExistingBaseDeletionQualities(); if( quals == null ) { quals = new byte[getBaseQualities().length]; - Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will + Arrays.fill(quals, DEFAULT_INSERTION_DELETION_QUAL); // Some day in the future when base insertion and base deletion quals exist the samtools API will // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 } return quals; From f849910c4ed85af12a5a1ac4bce2c9ea3e31c078 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 19 Dec 2012 15:39:58 -0500 Subject: [PATCH 04/22] BQSR optimization: only compute BAQ when there's at least one error to delocalize -- Saves something like 2/3 of the compute cost of BQSR --- .../gatk/walkers/bqsr/BaseRecalibrator.java | 50 ++++++++++++++++--- .../walkers/bqsr/ReadRecalibrationInfo.java | 7 ++- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 8a71872c1..dcaaafefd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -135,6 +135,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation + private final static byte NO_BAQ_UNCERTAINTY = (byte)'@'; /** * Parse the -cov arguments and create a list of covariates to be used here @@ -226,19 +227,23 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche return 0L; // skip this read completely } - final byte[] baqArray = calculateBAQArray(read); + final int[] isSNP = calculateIsSNP(read, ref, originalRead); + final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION); + final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION); + final int nErrors = nEvents(isSNP, isInsertion, isDeletion); + + // note for efficiency regions we don't compute the BAQ array unless we actually have + // some error to marginalize over. For ILMN data ~85% of reads have no error + final byte[] baqArray = nErrors == 0 ? flatBAQArray(read) : calculateBAQArray(read); if( baqArray != null ) { // some reads just can't be BAQ'ed final ReadCovariates covariates = RecalUtils.computeCovariates(read, requestedCovariates); final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases - final int[] isSNP = calculateIsSNP(read, ref, originalRead); - final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION); - final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION); final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray); final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray); final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray); - // aggregrate all of the info into our info object, and update the data + // aggregate all of the info into our info object, and update the data final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skip, snpErrors, insertionErrors, deletionErrors); recalibrationEngine.updateDataForRead(info); return 1L; @@ -247,6 +252,22 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche } } + /** + * Compute the number of mutational events across all hasEvent vectors + * + * Simply the sum of entries in hasEvents + * + * @param hasEvents a vector a vectors of 0 (no event) and 1 (has event) + * @return the total number of events across all hasEvent arrays + */ + private int nEvents(final int[]... hasEvents) { + int n = 0; + for ( final int[] hasEvent : hasEvents ) { + n += MathUtils.sum(hasEvent); + } + return n; + } + protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) { final byte[] bases = read.getReadBases(); final boolean[] skip = new boolean[bases.length]; @@ -374,7 +395,6 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche throw new ReviewedStingException("Array length mismatch detected. Malformed read?"); } - final byte NO_BAQ_UNCERTAINTY = (byte)'@'; final int BLOCK_START_UNSET = -1; final double[] fractionalErrors = new double[baqArray.length]; @@ -418,8 +438,24 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche } } + /** + * Create a BAQ style array that indicates no alignment uncertainty + * @param read the read for which we want a BAQ array + * @return a BAQ-style non-null byte[] counting NO_BAQ_UNCERTAINTY values + * // TODO -- could be optimized avoiding this function entirely by using this inline if the calculation code above + */ + private byte[] flatBAQArray(final GATKSAMRecord read) { + final byte[] baq = new byte[read.getReadLength()]; + Arrays.fill(baq, NO_BAQ_UNCERTAINTY); + return baq; + } + + /** + * Compute an actual BAQ array for read, based on its quals and the reference sequence + * @param read the read to BAQ + * @return a non-null BAQ tag array for read + */ private byte[] calculateBAQArray( final GATKSAMRecord read ) { - // todo -- it would be good to directly use the BAQ qualities rather than encoding and decoding the result and using the special @ value baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG); return BAQ.getBAQTag(read); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java index f8d3c4281..121e3449b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -84,7 +85,7 @@ public final class ReadRecalibrationInfo { * @return a valid quality score for event at offset */ @Requires("validOffset(offset)") - @Ensures("result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE") + @Ensures("validQual(result)") public byte getQual(final EventType eventType, final int offset) { switch ( eventType ) { case BASE_SUBSTITUTION: return baseQuals[offset]; @@ -154,4 +155,8 @@ public final class ReadRecalibrationInfo { private boolean validOffset(final int offset) { return offset >= 0 && offset < baseQuals.length; } + + private boolean validQual(final byte result) { + return result >= 0 && result <= QualityUtils.MAX_QUAL_SCORE; + } } From b92f563d06a0e0bc9418bf4785ec39954ef6cfd2 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 19 Dec 2012 15:51:51 -0500 Subject: [PATCH 05/22] NanoScheduler optimization for TraverseReadsNano -- Pre-read MapData into a list, which is actually faster than dealing with future lock contention issues with lots of map threads -- Increase the ReadShard default size to 100K reads by default --- .../gatk/datasources/reads/SAMDataSource.java | 7 ++++--- .../sting/gatk/traversals/TraverseReadsNano.java | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index e3f197716..5c932fdce 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -253,9 +253,10 @@ public class SAMDataSource { if(readBufferSize != null) ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests else { - // Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively - // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once. - ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000)); + // Choose a sensible default for the read buffer size. + // Previously we we're picked 100000 reads per BAM per shard with a max cap of 250K reads in memory at once. + // Now we are simply setting it to 100K reads + ReadShard.setReadBufferSize(100000); } resourcePool = new SAMResourcePool(Integer.MAX_VALUE); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index cd0198a29..ee71d82bb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -40,6 +40,7 @@ import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Iterator; +import java.util.LinkedList; /** * A nano-scheduling version of TraverseReads. @@ -53,6 +54,7 @@ import java.util.Iterator; */ public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { /** our log, which we want to capture anything from this class */ + private final static boolean PRE_READ_ALL_MAP_DATA = true; protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); private static final boolean DEBUG = false; final NanoScheduler nanoScheduler; @@ -111,7 +113,19 @@ public class TraverseReadsNano extends TraversalEngine, * should execute */ private Iterator aggregateMapData(final ReadShardDataProvider dataProvider) { - return new Iterator() { + final Iterator it = makeDataIterator(dataProvider); + if ( PRE_READ_ALL_MAP_DATA ) { + final LinkedList l = new LinkedList(); + while ( it.hasNext() ) l.add(it.next()); + return l.iterator(); + } else { + return it; + } + } + + + private Iterator makeDataIterator(final ReadShardDataProvider dataProvider) { + return new Iterator () { final ReadView reads = new ReadView(dataProvider); final ReadReferenceView reference = new ReadReferenceView(dataProvider); final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); From 0f04485c24eb77e5ea7159695e6532c7be44593f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 19 Dec 2012 16:17:51 -0500 Subject: [PATCH 06/22] NanoScheduler optimization: don't use a PriorityBlockingQueue for the MapResultsQueue -- Created a separate, limited interface MapResultsQueue object that previously was set to the PriorityBlockingQueue. -- The MapResultsQueue is now backed by a synchronized ExpandingArrayList, since job ids are integers incrementing from 0 to N. This means we avoid the n log n sort in the priority queue which was generating a lot of cost in the reduce step -- Had to update ReducerUnitTest because the test itself was brittle, and broken when I changed the underlying code. -- A few bits of minor code cleanup through the system (removing unused constructors, local variables, etc) -- ExpandingArrayList called ensureCapacity so that we increase the size of the arraylist once to accommodate the upcoming size needs --- .../utils/collections/ExpandingArrayList.java | 1 + .../utils/nanoScheduler/InputProducer.java | 6 - .../sting/utils/nanoScheduler/MapResult.java | 8 -- .../utils/nanoScheduler/MapResultsQueue.java | 116 ++++++++++++++++++ .../utils/nanoScheduler/NanoScheduler.java | 7 +- .../sting/utils/nanoScheduler/Reducer.java | 46 ++----- .../utils/nanoScheduler/ReducerUnitTest.java | 12 +- 7 files changed, 138 insertions(+), 58 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java b/public/java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java index 04ef8ece3..abd4eeaba 100755 --- a/public/java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/ExpandingArrayList.java @@ -54,6 +54,7 @@ public class ExpandingArrayList extends ArrayList { private void maybeExpand(int index, E value) { if ( index >= size() ) { + ensureCapacity(index+1); // make sure we have space to hold at least index + 1 elements // We need to add null items until we can safely set index to element for ( int i = size(); i <= index; i++ ) add(value); diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java index 84bb8d45f..ee5c46642 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java @@ -37,12 +37,6 @@ class InputProducer { int nRead = 0; int inputID = -1; - /** - * A latch used to block threads that want to start up only when all of the values - * in inputReader have been read by the thread executing run() - */ - final CountDownLatch latch = new CountDownLatch(1); - public InputProducer(final Iterator inputReader) { if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); this.inputReader = inputReader; diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java index 83d671560..4544f376c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java @@ -25,14 +25,6 @@ class MapResult extends EOFMarkedValue implements Comparable= 0"); } - /** - * Create the EOF marker version of MapResult - */ - MapResult() { - super(); - this.jobID = Integer.MAX_VALUE; - } - /** * @return the job ID of the map job that produced this MapResult */ diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java new file mode 100644 index 000000000..ef74d669d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResultsQueue.java @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.utils.collections.ExpandingArrayList; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 12/19/12 + * Time: 3:53 PM + * + * This class makes some critical assumptions. First is that the jobID of the first + * job is 0. If this isn't true the MapResultsQueue will certainly fail. + */ +public class MapResultsQueue { + //private final static boolean DEBUG = false; + //private final static Logger logger = Logger.getLogger(MapResultsQueue.class); + + /** + * Although naturally stored as priority blocking queue, this is actually quite expensive + * due to the O(n log n) sorting calculation. Since we know that the job ids start + * at 0 and increment by 1 in each successive job, we store an array instead. The + * array is indexed by jobID, and contains the MapResult for that job id. Because elements + * can be added to the queue in any order, we need to use an expanding array list to + * store the elements. + */ + final ExpandingArrayList> queue = new ExpandingArrayList>(10000); + + /** + * The jobID of the last job we've seen + */ + int prevJobID = -1; // no jobs observed + + /** + * Put mapResult into this MapResultsQueue, associated with its jobID + * @param mapResult a non-null map result + */ + public synchronized void put(final MapResult mapResult) { + if ( mapResult == null ) throw new IllegalArgumentException("mapResult cannot be null"); + + // make sure that nothing is at the job id for map + assert queue.size() < mapResult.getJobID() || queue.get(mapResult.getJobID()) == null; + + queue.set(mapResult.getJobID(), mapResult); + } + + /** + * Should we reduce the next value in the mapResultQueue? + * + * @return true if we should reduce + */ + public synchronized boolean nextValueIsAvailable() { + final MapResult nextMapResult = queue.get(nextJobID()); + + if ( nextMapResult == null ) { + // natural case -- the next job hasn't had a value added yet + return false; + } else if ( nextMapResult.getJobID() != nextJobID() ) { + // sanity check -- the job id at next isn't the one we expect + throw new IllegalStateException("Next job ID " + nextMapResult.getJobID() + " is not == previous job id " + prevJobID + " + 1"); + } else { + // there's a value at the next job id, so return true + return true; + } + } + + /** + * Get the next job ID'd be expect to see given our previous job id + * @return the next job id we'd fetch to reduce + */ + private int nextJobID() { + return prevJobID + 1; + } + + /** + * Can only be called when nextValueIsAvailable is true + * @return + * @throws InterruptedException + */ + // TODO -- does this have to be synchronized? -- I think the answer is no + public synchronized MapResult take() throws InterruptedException { + final MapResult result = queue.get(nextJobID()); + + // make sure the value we've fetched has the right id + assert result.getJobID() == nextJobID(); + + prevJobID = result.getJobID(); + queue.set(prevJobID, null); + + return result; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 38a1d7b8f..6aabc2c99 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -351,8 +351,7 @@ public class NanoScheduler { // a priority queue that stores up to bufferSize elements // produced by completed map jobs. - final PriorityBlockingQueue> mapResultQueue = - new PriorityBlockingQueue>(); + final MapResultsQueue mapResultQueue = new MapResultsQueue(); final Reducer reducer = new Reducer(reduce, errorTracker, initialValue); @@ -420,12 +419,12 @@ public class NanoScheduler { private class ReadMapReduceJob implements Runnable { final InputProducer inputProducer; - final PriorityBlockingQueue> mapResultQueue; + final MapResultsQueue mapResultQueue; final NSMapFunction map; final Reducer reducer; private ReadMapReduceJob(final InputProducer inputProducer, - final PriorityBlockingQueue> mapResultQueue, + final MapResultsQueue mapResultQueue, final NSMapFunction map, final Reducer reducer) { this.inputProducer = inputProducer; diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java index a7b94e323..66927d073 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java @@ -43,11 +43,6 @@ class Reducer { int numSubmittedJobs = UNSET_NUM_SUBMITTED_JOBS; // not yet set - /** - * The jobID of the last job we've seen - */ - int prevJobID = -1; // no jobs observed - /** * A counter keeping track of the number of jobs we're reduced */ @@ -72,26 +67,6 @@ class Reducer { this.sum = initialSum; } - /** - * Should we reduce the next value in the mapResultQueue? - * - * @param mapResultQueue the queue of map results - * @return true if we should reduce - */ - @Requires("mapResultQueue != null") - private synchronized boolean reduceNextValueInQueue(final PriorityBlockingQueue> mapResultQueue) { - final MapResult nextMapResult = mapResultQueue.peek(); - if ( nextMapResult == null ) { - return false; - } else if ( nextMapResult.getJobID() < prevJobID + 1 ) { - throw new IllegalStateException("Next job ID " + nextMapResult.getJobID() + " is not < previous job id " + prevJobID); - } else if ( nextMapResult.getJobID() == prevJobID + 1 ) { - return true; - } else { - return false; - } - } - /** * Reduce as much data as possible in mapResultQueue, returning the number of reduce calls completed * @@ -104,16 +79,20 @@ class Reducer { * @throws InterruptedException */ @Ensures("result >= 0") - public synchronized int reduceAsMuchAsPossible(final PriorityBlockingQueue> mapResultQueue) { + public synchronized int reduceAsMuchAsPossible(final MapResultsQueue mapResultQueue) { + // TODO -- have conditional lock acquistion. If the lock can be acquired, actually do some useful + // TODO -- work, but if it cannot just continue on your way. No sense in having all our map + // TODO -- threads block here just to fall through. The only question is if, with that locking scheme, + // TODO -- it's possible to leave some values in the map queue, as the thread owning the lock is + // TODO -- exiting and the only remaining thread to actually complete the reduce falls through. + // TODO -- all we really need to do is add a final call to reduceAsMuchAsPossible when exiting the nano scheduler + // TODO -- to make sure we've cleaned everything up if ( mapResultQueue == null ) throw new IllegalArgumentException("mapResultQueue cannot be null"); int nReducesNow = 0; -// if ( numSubmittedJobs != UNSET_NUM_SUBMITTED_JOBS ) -// logger.warn(" maybeReleaseLatch " + numJobsReduced + " numSubmittedJobs " + numSubmittedJobs + " queue " + mapResultQueue.size()); try { - while ( reduceNextValueInQueue(mapResultQueue) ) { + while ( mapResultQueue.nextValueIsAvailable() ) { final MapResult result = mapResultQueue.take(); - prevJobID = result.getJobID(); if ( ! result.isEOFMarker() ) { nReducesNow++; @@ -129,8 +108,6 @@ class Reducer { errorTracker.notifyOfError(ex); countDownLatch.countDown(); } -// if ( numSubmittedJobs == UNSET_NUM_SUBMITTED_JOBS ) -// logger.warn(" maybeReleaseLatch " + numJobsReduced + " numSubmittedJobs " + numSubmittedJobs + " queue " + mapResultQueue.size()); return nReducesNow; } @@ -176,10 +153,11 @@ class Reducer { public synchronized void setTotalJobCount(final int numOfSubmittedJobs) { if ( numOfSubmittedJobs < 0 ) throw new IllegalArgumentException("numOfSubmittedJobs must be >= 0, but saw " + numOfSubmittedJobs); + if ( numJobsReduced > numOfSubmittedJobs ) + throw new IllegalArgumentException("numOfSubmittedJobs " + numOfSubmittedJobs + " < numJobsReduced " + numJobsReduced); if ( this.numSubmittedJobs != UNSET_NUM_SUBMITTED_JOBS) throw new IllegalStateException("setlastJobID called multiple times, but should only be called once"); - //logger.warn("setTotalJobCount " + numJobsReduced + " numSubmitted " + numOfSubmittedJobs); this.numSubmittedJobs = numOfSubmittedJobs; maybeReleaseLatch(); } @@ -192,9 +170,7 @@ class Reducer { * @throws InterruptedException */ public ReduceType waitForFinalReduce() throws InterruptedException { - //logger.warn("waitForFinalReduce() " + numJobsReduced + " " + numSubmittedJobs); countDownLatch.await(); - //logger.warn(" done waitForFinalReduce"); return sum; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java index 6c17aa78d..6ea73f3cd 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java @@ -88,7 +88,7 @@ public class ReducerUnitTest extends BaseTest { if ( groupSize == -1 ) groupSize = allJobs.size(); - final PriorityBlockingQueue> mapResultsQueue = new PriorityBlockingQueue>(); + final MapResultsQueue mapResultsQueue = new MapResultsQueue(); final List>> jobGroups = Utils.groupList(allJobs, groupSize); final ReduceSumTest reduce = new ReduceSumTest(); @@ -98,6 +98,7 @@ public class ReducerUnitTest extends BaseTest { final ExecutorService es = Executors.newSingleThreadExecutor(); es.submit(waitingThread); + int lastJobID = -1; int nJobsSubmitted = 0; int jobGroupCount = 0; final int lastJobGroupCount = jobGroups.size() - 1; @@ -106,12 +107,13 @@ public class ReducerUnitTest extends BaseTest { for ( final List> jobs : jobGroups ) { //logger.warn("Processing job group " + jobGroupCount + " with " + jobs.size() + " jobs"); for ( final MapResult job : jobs ) { - mapResultsQueue.add(job); + lastJobID = Math.max(lastJobID, job.getJobID()); + mapResultsQueue.put(job); nJobsSubmitted++; } if ( jobGroupCount == lastJobGroupCount ) { - mapResultsQueue.add(new MapResult()); + mapResultsQueue.put(new MapResult(lastJobID+1)); nJobsSubmitted++; } @@ -119,7 +121,7 @@ public class ReducerUnitTest extends BaseTest { if ( jobGroupCount == 0 && setJobIDAtStart ) { // only can do the setJobID if jobs cannot be submitted out of order - reducer.setTotalJobCount(allJobs.size()); + reducer.setTotalJobCount(allJobs.size()+1); Assert.assertFalse(reducer.latchIsReleased(), "Latch should be closed even after setting last job if we haven't processed anything"); } @@ -141,7 +143,7 @@ public class ReducerUnitTest extends BaseTest { Assert.assertTrue(reducer.latchIsReleased(), "Latch should be released after reducing with last job id being set"); else { Assert.assertFalse(reducer.latchIsReleased(), "Latch should be closed after reducing without last job id being set"); - reducer.setTotalJobCount(allJobs.size()); + reducer.setTotalJobCount(allJobs.size() + 1); Assert.assertTrue(reducer.latchIsReleased(), "Latch should be released after reducing after setting last job id "); } From 7796ba7601689ea864934853e4509d2abd4dae9a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 20 Dec 2012 12:34:05 -0500 Subject: [PATCH 07/22] Minor optimizations for NanoScheduler -- Reducer.maybeReleaseLatch is no longer synchronized -- NanoScheduler only prints progress every 100 or so map calls --- .../utils/nanoScheduler/NanoScheduler.java | 18 +++++++++++++----- .../sting/utils/nanoScheduler/Reducer.java | 2 +- .../nanoScheduler/NanoSchedulerUnitTest.java | 2 +- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index 6aabc2c99..d8325f83e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -43,7 +43,7 @@ import java.util.concurrent.*; public class NanoScheduler { private final static Logger logger = Logger.getLogger(NanoScheduler.class); private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; - private final static boolean LOG_MAP_TIMES = false; + protected final static int UPDATE_PROGRESS_FREQ = 100; final int bufferSize; final int nThreads; @@ -243,8 +243,7 @@ public class NanoScheduler { // map final MapType mapValue = map.apply(input); - if ( progressFunction != null ) - progressFunction.progress(input); + updateProgress(i++, input); // reduce sum = reduce.apply(mapValue, sum); @@ -254,6 +253,16 @@ public class NanoScheduler { return sum; } + /** + * Maybe update the progress meter (maybe because we don't want to do so so often that it costs cpu time) + * @param counter increasing counter to use to cut down on updates + * @param input the input we're currently at + */ + private void updateProgress(final int counter, final InputType input) { + if ( progressFunction != null && counter % UPDATE_PROGRESS_FREQ == 0 ) + progressFunction.progress(input); + } + /** * Efficient parallel version of Map/Reduce * @@ -453,8 +462,7 @@ public class NanoScheduler { // enqueue the result into the mapResultQueue result = new MapResult(mapValue, inputWrapper.getId()); - if ( progressFunction != null ) - progressFunction.progress(input); + updateProgress(inputWrapper.getId(), input); } else { // if there's no input we push empty MapResults with jobIDs for synchronization with Reducer result = new MapResult(inputWrapper.getId()); diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java index 66927d073..a3d3f9056 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java @@ -117,7 +117,7 @@ class Reducer { * * Appropriate means we've seen the last job, or there's only a single job id */ - private synchronized void maybeReleaseLatch() { + private void maybeReleaseLatch() { if ( numJobsReduced == numSubmittedJobs ) { // either we've already seen the last one prevJobID == numSubmittedJobs or // the last job ID is -1, meaning that no jobs were ever submitted diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java index 61e8ec0a1..52cd904db 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -101,7 +101,7 @@ public class NanoSchedulerUnitTest extends BaseTest { public int nExpectedCallbacks() { int nElements = Math.max(end - start, 0); - return nElements / bufferSize; + return nElements / bufferSize / NanoScheduler.UPDATE_PROGRESS_FREQ; } public Map2x makeMap() { return addDelays ? new Map2xWithDelays() : new Map2x(); } From 14944b5d730c52d73765c9de41d8d157be2e6418 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 20 Dec 2012 14:53:58 -0500 Subject: [PATCH 08/22] Incorporating clover into build.xml -- See http://gatkforums.broadinstitute.org/discussion/2002/clover-coverage-analysis-with-ant for use docs -- Fix for artificial reads not having proper read groups, causing NPE in some tests -- Added clover itself to private/resources --- build.xml | 50 +++++++++++++------ .../sting/gatk/phonehome/GATKRunReport.java | 4 +- .../sting/utils/sam/ArtificialSAMUtils.java | 1 + 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/build.xml b/build.xml index 49efd616d..0cfc1a8cd 100644 --- a/build.xml +++ b/build.xml @@ -107,7 +107,7 @@ - + @@ -267,19 +267,19 @@ - - + + + + + + - - + + + + + + @@ -596,6 +596,7 @@ + + @@ -1090,7 +1092,6 @@ - @@ -1119,8 +1120,12 @@ + + + + @@ -1128,6 +1133,17 @@ + + + + + + + + + + + @@ -1207,6 +1223,7 @@ + @@ -1220,10 +1237,11 @@ listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.TestNGTestTransformer,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter"> + - + @@ -1262,6 +1280,7 @@ + @@ -1374,6 +1393,7 @@ + diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index 51fed470f..2bc14aa69 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -131,7 +131,7 @@ public class GATKRunReport { private String hostName; @Element(required = true, name = "java") - private String java; + private String javaVersion; @Element(required = true, name = "machine") private String machine; @@ -212,7 +212,7 @@ public class GATKRunReport { hostName = Utils.resolveHostname(); // basic java information - java = Utils.join("-", Arrays.asList(System.getProperty("java.vendor"), System.getProperty("java.version"))); + javaVersion = Utils.join("-", Arrays.asList(System.getProperty("java.vendor"), System.getProperty("java.version"))); machine = Utils.join("-", Arrays.asList(System.getProperty("os.name"), System.getProperty("os.arch"))); // if there was an exception, capture it diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index 0859957a3..77cf500f2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -188,6 +188,7 @@ public class ArtificialSAMUtils { GATKSAMRecord rec = createArtificialRead(header, name, refIndex, alignmentStart, bases.length); rec.setReadBases(bases); rec.setBaseQualities(qual); + rec.setReadGroup(new GATKSAMReadGroupRecord("x")); if (refIndex == -1) { rec.setReadUnmappedFlag(true); } From 940816f16a6de33c35789ec73f5de02618b647f6 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 21 Dec 2012 10:50:56 -0500 Subject: [PATCH 09/22] GATKSamRecord now checks that the read group is a GATKReadGroupRecord, and if not makes one --- .../src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index ebb3c1ad0..cf06c59e8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -154,7 +154,8 @@ public class GATKSAMRecord extends BAMRecord { @Override public GATKSAMReadGroupRecord getReadGroup() { if ( ! retrievedReadGroup ) { - mReadGroup = (GATKSAMReadGroupRecord)super.getReadGroup(); + final SAMReadGroupRecord rg = super.getReadGroup(); + mReadGroup = rg instanceof GATKSAMReadGroupRecord ? (GATKSAMReadGroupRecord)rg : new GATKSAMReadGroupRecord(rg); retrievedReadGroup = true; } return mReadGroup; From 161487b4a4cc63893b3062e7312ff82263fcb233 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 21 Dec 2012 10:51:24 -0500 Subject: [PATCH 10/22] MapResult compareTo() is now unit tested -- Thanks clover! --- .../nanoScheduler/MapResultUnitTest.java | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 public/java/test/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java new file mode 100644 index 000000000..93fe9578f --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/MapResultUnitTest.java @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** +* UnitTests for the InputProducer +* +* User: depristo +* Date: 8/24/12 +* Time: 11:25 AM +* To change this template use File | Settings | File Templates. +*/ +public class MapResultUnitTest { + @DataProvider(name = "CompareTester") + public Object[][] createCompareTester() { + List tests = new ArrayList(); + + for ( int id1 = 0; id1 < 10; id1++ ) { + for ( int id2 = 0; id2 < 10; id2++ ) { + tests.add(new Object[]{ id1, id2, Integer.valueOf(id1).compareTo(id2)}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "CompareTester") + public void testInputProducer(final int id1, final int id2, final int comp ) throws InterruptedException { + final MapResult mr1 = new MapResult(id1, id1); + final MapResult mr2 = new MapResult(id2, id2); + Assert.assertEquals(mr1.compareTo(mr2), comp, "Compare MapResultsUnitTest failed"); + } +} From bf81db40f728307d6856154f1b00396f54026e9b Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 22 Dec 2012 11:49:30 -0500 Subject: [PATCH 11/22] NanoScheduler reducer optimizations -- reduceAsMuchAsPossible no longer blocks threads via synchronization, but instead uses an explicit lock to manage access. If the lock is already held (because some thread is doing reduce) then the thread attempting to reduce immediately exits the call and continues doing productive work. They removes one major source of blocking contention in the NanoScheduler --- .../utils/nanoScheduler/NanoScheduler.java | 21 +++--- .../sting/utils/nanoScheduler/Reducer.java | 69 ++++++++++++------ .../utils/nanoScheduler/ReducerUnitTest.java | 70 ++++++++++++++++--- 3 files changed, 121 insertions(+), 39 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index d8325f83e..d3b8c8149 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -381,7 +381,7 @@ public class NanoScheduler { reducer.setTotalJobCount(nSubmittedJobs); // wait for all of the input and map threads to finish - return waitForCompletion(inputProducer, reducer); + return waitForCompletion(mapResultQueue, reducer); } catch (Throwable ex) { // logger.warn("Reduce job got exception " + ex); errorTracker.notifyOfError(ex); @@ -392,17 +392,21 @@ public class NanoScheduler { /** * Wait until the input thread and all map threads have completed running, and return the final reduce result */ - private ReduceType waitForCompletion(final InputProducer inputProducer, + private ReduceType waitForCompletion(final MapResultsQueue mapResultsQueue, final Reducer reducer) throws InterruptedException { - // wait until we have a final reduce result -// logger.warn("waiting for final reduce"); - final ReduceType finalSum = reducer.waitForFinalReduce(); - // wait for all the map threads to finish by acquiring and then releasing all map job semaphores -// logger.warn("waiting on map"); runningMapJobSlots.acquire(bufferSize); runningMapJobSlots.release(bufferSize); + // do a final reduce here. This is critically important because the InputMapReduce jobs + // no longer block on reducing, so it's possible for all the threads to end with a few + // reduce jobs on the queue still to do. This call ensures that we reduce everything + reducer.reduceAsMuchAsPossible(mapResultsQueue, true); + + // wait until we have a final reduce result + final ReduceType finalSum = reducer.waitForFinalReduce(); + + // everything is finally shutdown, return the final reduce value return finalSum; } @@ -470,7 +474,8 @@ public class NanoScheduler { mapResultQueue.put(result); - final int nReduced = reducer.reduceAsMuchAsPossible(mapResultQueue); + // reduce as much as possible, without blocking, if another thread is already doing reduces + final int nReduced = reducer.reduceAsMuchAsPossible(mapResultQueue, false); } catch (Throwable ex) { errorTracker.notifyOfError(ex); } finally { diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java index a3d3f9056..25e8b1fe6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java @@ -1,12 +1,12 @@ package org.broadinstitute.sting.utils.nanoScheduler; import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; import java.util.concurrent.CountDownLatch; -import java.util.concurrent.PriorityBlockingQueue; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; /** * Reducer supporting two-threaded reduce of the map/reduce. @@ -31,9 +31,10 @@ class Reducer { private final static Logger logger = Logger.getLogger(Reducer.class); private final static int UNSET_NUM_SUBMITTED_JOBS = -2; - final CountDownLatch countDownLatch = new CountDownLatch(1); - final NSReduceFunction reduce; - final MultiThreadedErrorTracker errorTracker; + private final CountDownLatch countDownLatch = new CountDownLatch(1); + private final NSReduceFunction reduce; + private final MultiThreadedErrorTracker errorTracker; + private final Lock reduceLock = new ReentrantLock(); /** * The sum of the reduce function applied to all MapResults. After this Reducer @@ -79,39 +80,63 @@ class Reducer { * @throws InterruptedException */ @Ensures("result >= 0") - public synchronized int reduceAsMuchAsPossible(final MapResultsQueue mapResultQueue) { - // TODO -- have conditional lock acquistion. If the lock can be acquired, actually do some useful - // TODO -- work, but if it cannot just continue on your way. No sense in having all our map - // TODO -- threads block here just to fall through. The only question is if, with that locking scheme, - // TODO -- it's possible to leave some values in the map queue, as the thread owning the lock is - // TODO -- exiting and the only remaining thread to actually complete the reduce falls through. - // TODO -- all we really need to do is add a final call to reduceAsMuchAsPossible when exiting the nano scheduler - // TODO -- to make sure we've cleaned everything up + public int reduceAsMuchAsPossible(final MapResultsQueue mapResultQueue, final boolean waitForLock) { if ( mapResultQueue == null ) throw new IllegalArgumentException("mapResultQueue cannot be null"); int nReducesNow = 0; + final boolean haveLock = acquireReduceLock(waitForLock); try { - while ( mapResultQueue.nextValueIsAvailable() ) { - final MapResult result = mapResultQueue.take(); + if ( haveLock ) { + while ( mapResultQueue.nextValueIsAvailable() ) { + final MapResult result = mapResultQueue.take(); - if ( ! result.isEOFMarker() ) { - nReducesNow++; + if ( ! result.isEOFMarker() ) { + nReducesNow++; - // apply reduce, keeping track of sum - sum = reduce.apply(result.getValue(), sum); + // apply reduce, keeping track of sum + sum = reduce.apply(result.getValue(), sum); + } + + numJobsReduced++; + maybeReleaseLatch(); } - - numJobsReduced++; - maybeReleaseLatch(); } } catch (Exception ex) { errorTracker.notifyOfError(ex); countDownLatch.countDown(); + } finally { + if ( haveLock ) // if we acquired the lock, unlock it + releaseReduceLock(); } return nReducesNow; } + /** + * Acquire the reduce lock, either returning immediately if not possible or blocking until the lock is available + * + * @param blockUntilAvailable if true, we will block until the lock is available, otherwise we return immediately + * without acquiring the lock + * @return true if the lock has been acquired, false otherwise + */ + protected boolean acquireReduceLock(final boolean blockUntilAvailable) { + if ( blockUntilAvailable ) { + reduceLock.lock(); + return true; + } else { + return reduceLock.tryLock(); + } + } + + /** + * Free the reduce lock. + * + * Assumes that the invoking thread actually previously acquired the lock (it's a problem if not). + */ + protected void releaseReduceLock() { + reduceLock.unlock(); + } + /** * release the latch if appropriate * diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java index 6ea73f3cd..5db5acde0 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java @@ -11,10 +11,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.List; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.PriorityBlockingQueue; -import java.util.concurrent.TimeUnit; +import java.util.concurrent.*; /** * UnitTests for Reducer @@ -125,7 +122,7 @@ public class ReducerUnitTest extends BaseTest { Assert.assertFalse(reducer.latchIsReleased(), "Latch should be closed even after setting last job if we haven't processed anything"); } - final int nReduced = reducer.reduceAsMuchAsPossible(mapResultsQueue); + final int nReduced = reducer.reduceAsMuchAsPossible(mapResultsQueue, true); Assert.assertTrue(nReduced <= nJobsSubmitted, "Somehow reduced more jobs than submitted"); if ( setJobIDAtStart ) { @@ -152,16 +149,71 @@ public class ReducerUnitTest extends BaseTest { es.awaitTermination(1, TimeUnit.HOURS); } - @Test(expectedExceptions = IllegalStateException.class) + @Test(enabled = true, expectedExceptions = IllegalStateException.class) private void runSettingJobIDTwice() throws Exception { - final PriorityBlockingQueue> mapResultsQueue = new PriorityBlockingQueue>(); - final Reducer reducer = new Reducer(new ReduceSumTest(), new MultiThreadedErrorTracker(), 0); - reducer.setTotalJobCount(10); reducer.setTotalJobCount(15); } + @Test(timeOut = 1000, invocationCount = 100) + private void testNonBlockingReduce() throws Exception { + final Reducer reducer = new Reducer(new ReduceSumTest(), new MultiThreadedErrorTracker(), 0); + final MapResultsQueue mapResultsQueue = new MapResultsQueue(); + mapResultsQueue.put(new MapResult(0, 0)); + mapResultsQueue.put(new MapResult(1, 1)); + + final CountDownLatch latch = new CountDownLatch(1); + final ExecutorService es = Executors.newSingleThreadExecutor(); + + es.submit(new Runnable() { + @Override + public void run() { + reducer.acquireReduceLock(true); + latch.countDown(); + } + }); + + latch.await(); + final int nReduced = reducer.reduceAsMuchAsPossible(mapResultsQueue, false); + Assert.assertEquals(nReduced, 0, "The reducer lock was already held but we did some work"); + es.shutdown(); + es.awaitTermination(1, TimeUnit.HOURS); + } + + @Test(timeOut = 10000, invocationCount = 100) + private void testBlockingReduce() throws Exception { + final Reducer reducer = new Reducer(new ReduceSumTest(), new MultiThreadedErrorTracker(), 0); + final MapResultsQueue mapResultsQueue = new MapResultsQueue(); + mapResultsQueue.put(new MapResult(0, 0)); + mapResultsQueue.put(new MapResult(1, 1)); + + final CountDownLatch latch = new CountDownLatch(1); + final ExecutorService es = Executors.newSingleThreadExecutor(); + + es.submit(new Runnable() { + @Override + public void run() { + reducer.acquireReduceLock(true); + latch.countDown(); + try { + Thread.sleep(100); + } catch ( InterruptedException e ) { + ; + } finally { + reducer.releaseReduceLock(); + } + } + }); + + latch.await(); + final int nReduced = reducer.reduceAsMuchAsPossible(mapResultsQueue, true); + Assert.assertEquals(nReduced, 2, "The reducer should have blocked until the lock was freed and reduced 2 values"); + es.shutdown(); + es.awaitTermination(1, TimeUnit.HOURS); + } + + public class ReduceSumTest implements NSReduceFunction { int nRead = 0; int lastValue = -1; From aa3ee299294a4b35b0ef5812360adb463739a6b9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sat, 22 Dec 2012 13:13:01 -0500 Subject: [PATCH 12/22] Handle case where the ReadGroup is null in GATKSAMRecord --- .../broadinstitute/sting/utils/sam/GATKSAMRecord.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index cf06c59e8..beadead0a 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -155,7 +155,16 @@ public class GATKSAMRecord extends BAMRecord { public GATKSAMReadGroupRecord getReadGroup() { if ( ! retrievedReadGroup ) { final SAMReadGroupRecord rg = super.getReadGroup(); - mReadGroup = rg instanceof GATKSAMReadGroupRecord ? (GATKSAMReadGroupRecord)rg : new GATKSAMReadGroupRecord(rg); + + // three cases: rg may be null (no rg, rg may already be a GATKSAMReadGroupRecord, or it may be + // a regular SAMReadGroupRecord in which case we have to make it a GATKSAMReadGroupRecord + if ( rg == null ) + mReadGroup = null; + else if ( rg instanceof GATKSAMReadGroupRecord ) + mReadGroup = (GATKSAMReadGroupRecord)rg; + else + mReadGroup = new GATKSAMReadGroupRecord(rg); + retrievedReadGroup = true; } return mReadGroup; From 295455eee248fc1932e49b7e11f1f84ff743216f Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 23 Dec 2012 08:53:49 -0500 Subject: [PATCH 13/22] NanoScheduler optimizations and simplification -- The previous model was to enqueue individual map jobs (with a resolution of 1 map job per map call), to track the number of map calls submitted via a counter and a semaphore, and to use this information in each map job and reduce to control the number of map jobs, when reduce was complete, etc. All hideously complex. -- This new model is vastly simply. The reducer basically knows nothing about the control mechanisms in the NanoScheduler. It just supports multi-threaded reduce. The NanoScheduler enqueues exactly nThread jobs to be run, which continually loop reading, mapping, and reducing until they run out of material to read, when they shut down. The master thread of the NS just holds a CountDownLatch, initialized to nThreads, and when each thread exits it reduces the latch by 1. The master thread gets the final reduce result when its free by the latch reaching 0. It's all super super simple. -- Because this model uses vastly fewer synchronization primitives within the NS itself, it's naturally much faster at getting things done, without any of the overhead obvious in profiles of BQSR -nct 2. --- .../utils/nanoScheduler/EOFMarkedValue.java | 25 +++ .../utils/nanoScheduler/InputProducer.java | 25 +++ .../sting/utils/nanoScheduler/MapResult.java | 25 +++ .../utils/nanoScheduler/NSMapFunction.java | 25 +++ .../nanoScheduler/NSProgressFunction.java | 25 +++ .../utils/nanoScheduler/NSReduceFunction.java | 25 +++ .../utils/nanoScheduler/NanoScheduler.java | 143 +++++++++--------- .../sting/utils/nanoScheduler/Reducer.java | 134 +++++++--------- .../utils/nanoScheduler/ReducerUnitTest.java | 64 ++------ 9 files changed, 287 insertions(+), 204 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java index d0ad51cb0..464ebfcd5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.nanoScheduler; /** diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java index ee5c46642..0ccb2b8cc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.nanoScheduler; import org.apache.log4j.Logger; diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java index 4544f376c..fd23b11d8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.nanoScheduler; /** diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java index cc5335051..1311126d0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.nanoScheduler; /** diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java index 8b12c62c4..785a7f4fd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.nanoScheduler; /** diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java index 879a33a1d..8191b16c9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.nanoScheduler; /** diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java index d3b8c8149..c3854eef2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -1,3 +1,28 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.nanoScheduler; import com.google.java.contract.Ensures; @@ -45,11 +70,18 @@ public class NanoScheduler { private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; protected final static int UPDATE_PROGRESS_FREQ = 100; + /** + * Currently not used, but kept because it's conceptual reasonable to have a buffer + */ final int bufferSize; + + /** + * The number of threads we're using to execute the map jobs in this nano scheduler + */ final int nThreads; + final ExecutorService masterExecutor; final ExecutorService mapExecutor; - final Semaphore runningMapJobSlots; final MultiThreadedErrorTracker errorTracker = new MultiThreadedErrorTracker(); boolean shutdown = false; @@ -75,11 +107,9 @@ public class NanoScheduler { if ( nThreads == 1 ) { this.mapExecutor = this.masterExecutor = null; - runningMapJobSlots = null; } else { this.masterExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-master-thread-%d")); this.mapExecutor = Executors.newFixedThreadPool(nThreads, new NamedThreadFactory("NS-map-thread-%d")); - runningMapJobSlots = new Semaphore(this.bufferSize); } } @@ -358,32 +388,23 @@ public class NanoScheduler { // Create the input producer and start it running final InputProducer inputProducer = new InputProducer(inputReader); - // a priority queue that stores up to bufferSize elements - // produced by completed map jobs. + // create the MapResultsQueue to store results of map jobs. final MapResultsQueue mapResultQueue = new MapResultsQueue(); - final Reducer reducer - = new Reducer(reduce, errorTracker, initialValue); + // create the reducer we'll use for this nano scheduling run + final Reducer reducer = new Reducer(reduce, errorTracker, initialValue); + + final CountDownLatch runningMapJobs = new CountDownLatch(nThreads); try { - int nSubmittedJobs = 0; - - while ( continueToSubmitJobs(nSubmittedJobs, inputProducer) ) { - // acquire a slot to run a map job. Blocks if too many jobs are enqueued - runningMapJobSlots.acquire(); - - mapExecutor.submit(new ReadMapReduceJob(inputProducer, mapResultQueue, map, reducer)); - nSubmittedJobs++; + // create and submit the info needed by the read/map/reduce threads to do their work + for ( int i = 0; i < nThreads; i++ ) { + mapExecutor.submit(new ReadMapReduceJob(inputProducer, mapResultQueue, runningMapJobs, map, reducer)); } - // mark the last job id we've submitted so we now the id to wait for - //logger.warn("setting jobs submitted to " + nSubmittedJobs); - reducer.setTotalJobCount(nSubmittedJobs); - // wait for all of the input and map threads to finish - return waitForCompletion(mapResultQueue, reducer); + return waitForCompletion(mapResultQueue, runningMapJobs, reducer); } catch (Throwable ex) { -// logger.warn("Reduce job got exception " + ex); errorTracker.notifyOfError(ex); return initialValue; } @@ -393,10 +414,10 @@ public class NanoScheduler { * Wait until the input thread and all map threads have completed running, and return the final reduce result */ private ReduceType waitForCompletion(final MapResultsQueue mapResultsQueue, + final CountDownLatch runningMapJobs, final Reducer reducer) throws InterruptedException { - // wait for all the map threads to finish by acquiring and then releasing all map job semaphores - runningMapJobSlots.acquire(bufferSize); - runningMapJobSlots.release(bufferSize); + // wait for all the map threads to finish by waiting on the runningMapJobs latch + runningMapJobs.await(); // do a final reduce here. This is critically important because the InputMapReduce jobs // no longer block on reducing, so it's possible for all the threads to end with a few @@ -404,30 +425,11 @@ public class NanoScheduler { reducer.reduceAsMuchAsPossible(mapResultsQueue, true); // wait until we have a final reduce result - final ReduceType finalSum = reducer.waitForFinalReduce(); - + final ReduceType finalSum = reducer.getReduceResult(); // everything is finally shutdown, return the final reduce value return finalSum; } - - /** - * Should we continue to submit jobs given the number of jobs already submitted and the - * number of read items in inputProducer? - * - * We continue to submit jobs while inputProducer hasn't reached EOF or the number - * of jobs we've enqueued isn't the number of read elements. This means that in - * some cases we submit more jobs than total read elements (cannot know because of - * multi-threading) so map jobs must handle the case where getNext() returns EOF. - * - * @param nJobsSubmitted - * @param inputProducer - * @return - */ - private boolean continueToSubmitJobs(final int nJobsSubmitted, final InputProducer inputProducer) { - final int nReadItems = inputProducer.getNumInputValues(); - return nReadItems == -1 || nJobsSubmitted < nReadItems; - } } private class ReadMapReduceJob implements Runnable { @@ -435,13 +437,16 @@ public class NanoScheduler { final MapResultsQueue mapResultQueue; final NSMapFunction map; final Reducer reducer; + final CountDownLatch runningMapJobs; private ReadMapReduceJob(final InputProducer inputProducer, final MapResultsQueue mapResultQueue, + final CountDownLatch runningMapJobs, final NSMapFunction map, final Reducer reducer) { this.inputProducer = inputProducer; this.mapResultQueue = mapResultQueue; + this.runningMapJobs = runningMapJobs; this.map = map; this.reducer = reducer; } @@ -449,39 +454,41 @@ public class NanoScheduler { @Override public void run() { try { - // get the next item from the input producer - final InputProducer.InputValue inputWrapper = inputProducer.next(); + boolean done = false; + while ( ! done ) { + // get the next item from the input producer + final InputProducer.InputValue inputWrapper = inputProducer.next(); - // depending on inputWrapper, actually do some work or not, putting result input result object - final MapResult result; - if ( ! inputWrapper.isEOFMarker() ) { - // just skip doing anything if we don't have work to do, which is possible - // because we don't necessarily know how much input there is when we queue - // up our jobs - final InputType input = inputWrapper.getValue(); + // depending on inputWrapper, actually do some work or not, putting result input result object + final MapResult result; + if ( ! inputWrapper.isEOFMarker() ) { + // just skip doing anything if we don't have work to do, which is possible + // because we don't necessarily know how much input there is when we queue + // up our jobs + final InputType input = inputWrapper.getValue(); - // map - final MapType mapValue = map.apply(input); + // actually execute the map + final MapType mapValue = map.apply(input); - // enqueue the result into the mapResultQueue - result = new MapResult(mapValue, inputWrapper.getId()); + // enqueue the result into the mapResultQueue + result = new MapResult(mapValue, inputWrapper.getId()); - updateProgress(inputWrapper.getId(), input); - } else { - // if there's no input we push empty MapResults with jobIDs for synchronization with Reducer - result = new MapResult(inputWrapper.getId()); + mapResultQueue.put(result); + + // reduce as much as possible, without blocking, if another thread is already doing reduces + final int nReduced = reducer.reduceAsMuchAsPossible(mapResultQueue, false); + + updateProgress(inputWrapper.getId(), input); + } else { + done = true; + } } - - mapResultQueue.put(result); - - // reduce as much as possible, without blocking, if another thread is already doing reduces - final int nReduced = reducer.reduceAsMuchAsPossible(mapResultQueue, false); } catch (Throwable ex) { errorTracker.notifyOfError(ex); } finally { // we finished a map job, release the job queue semaphore - runningMapJobSlots.release(); + runningMapJobs.countDown(); } } } -} +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java index 25e8b1fe6..294065838 100644 --- a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java @@ -1,39 +1,67 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.utils.nanoScheduler; import com.google.java.contract.Ensures; import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; -import java.util.concurrent.CountDownLatch; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; /** - * Reducer supporting two-threaded reduce of the map/reduce. + * Reducer supporting multi-threaded reduce of the map/reduce. * - * The first thread, using the reduceAsMuchAsPossible function, actually reduces the data - * as it arrives in the blockingQueue. + * reduceAsMuchAsPossible is the key function. Multiple threads can call into this, providing + * the map results queue, and this class accumulates the result of calling reduce + * on the maps objects. reduceAsMuchAsPossible isn't directly synchronized, but manages multi-threading + * directly with a lock. Threads can request either to block on the reduce call until it can be + * executed, or immediately exit if the lock isn't available. That allows multi-threaded users + * to avoid piling up waiting to reduce while one thread is reducing. They can instead immediately + * leave to go do something else productive * - * The second thread, using the waitForFinalReduce, can block on this data structure - * until that all jobs have arrived and been reduced. - * - * The key function for communication here is setTotalJobCount(), which the thread that submits - * jobs that enqueue MapResults into the blocking queue must call ONCE to tell the - * Reducer the total number of jobs that have been submitted for map. When numOfSubmittedJobs - * have been processed, this class frees a latch that allows thread blocked on waitForFinalReduce to proceed. - * - * This thread reads from mapResultsQueue until the poison EOF object arrives. At each - * stage is calls reduce(value, sum). The blocking mapResultQueue ensures that the - * queue waits until the mapResultQueue has a value to take. Then, it gets and waits - * until the map result Future has a value. + * @author depristo + * @since 2012 */ class Reducer { private final static Logger logger = Logger.getLogger(Reducer.class); - private final static int UNSET_NUM_SUBMITTED_JOBS = -2; - private final CountDownLatch countDownLatch = new CountDownLatch(1); + /** + * The reduce function to execute + */ private final NSReduceFunction reduce; + + /** + * Used to communicate errors to the outer master thread + */ private final MultiThreadedErrorTracker errorTracker; + + /** + * Lock used to protect the call reduceAsMuchAsPossible from race conditions + */ private final Lock reduceLock = new ReentrantLock(); /** @@ -42,13 +70,6 @@ class Reducer { */ ReduceType sum; - int numSubmittedJobs = UNSET_NUM_SUBMITTED_JOBS; // not yet set - - /** - * A counter keeping track of the number of jobs we're reduced - */ - int numJobsReduced = 0; - /** * Create a new Reducer that will apply the reduce function with initialSum value * to values via reduceAsMuchAsPossible, timing the reduce function call costs with @@ -96,14 +117,10 @@ class Reducer { // apply reduce, keeping track of sum sum = reduce.apply(result.getValue(), sum); } - - numJobsReduced++; - maybeReleaseLatch(); } } } catch (Exception ex) { errorTracker.notifyOfError(ex); - countDownLatch.countDown(); } finally { if ( haveLock ) // if we acquired the lock, unlock it releaseReduceLock(); @@ -138,64 +155,15 @@ class Reducer { } /** - * release the latch if appropriate + * Get the current reduce result resulting from applying reduce(...) to all MapResult elements. * - * Appropriate means we've seen the last job, or there's only a single job id - */ - private void maybeReleaseLatch() { - if ( numJobsReduced == numSubmittedJobs ) { - // either we've already seen the last one prevJobID == numSubmittedJobs or - // the last job ID is -1, meaning that no jobs were ever submitted - countDownLatch.countDown(); - } - } - - /** - * For testing only - * - * @return true if latch is released - */ - protected synchronized boolean latchIsReleased() { - return countDownLatch.getCount() == 0; - } - - /** - * Key function: tell this class the total number of jobs will provide data in the mapResultsQueue - * - * The total job count when we free threads blocked on waitForFinalReduce. When we see numOfSubmittedJobs - * MapResults from the queue, those threads are released. - * - * Until this function is called, those thread will block forever. The numOfSubmittedJobs has a few constraints. - * First, it must be >= 0. 0 indicates that in fact no jobs will ever be submitted (i.e., there's no - * data coming) so the latch should be opened immediately. If it's >= 1, we will wait until - * we see numOfSubmittedJobs jobs before freeing them. - * - * Note that we throw an IllegalStateException if this function is called twice. - * - * @param numOfSubmittedJobs int >= 0 indicating the total number of MapResults that will - * enqueue results into our queue - */ - public synchronized void setTotalJobCount(final int numOfSubmittedJobs) { - if ( numOfSubmittedJobs < 0 ) - throw new IllegalArgumentException("numOfSubmittedJobs must be >= 0, but saw " + numOfSubmittedJobs); - if ( numJobsReduced > numOfSubmittedJobs ) - throw new IllegalArgumentException("numOfSubmittedJobs " + numOfSubmittedJobs + " < numJobsReduced " + numJobsReduced); - if ( this.numSubmittedJobs != UNSET_NUM_SUBMITTED_JOBS) - throw new IllegalStateException("setlastJobID called multiple times, but should only be called once"); - - this.numSubmittedJobs = numOfSubmittedJobs; - maybeReleaseLatch(); - } - - /** - * Block until the last job has submitted its MapResult to our queue, and we've reduced it, and - * return the reduce result resulting from applying reduce(...) to all MapResult elements. + * Note that this method cannot know if future reduce calls are coming in. So it simply gets + * the current reduce result. It is up to the caller to know whether the returned value is + * a partial result, or the full final value * * @return the total reduce result across all jobs - * @throws InterruptedException */ - public ReduceType waitForFinalReduce() throws InterruptedException { - countDownLatch.await(); + public ReduceType getReduceResult() { return sum; } } diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java index 5db5acde0..4fd875c0e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java @@ -27,19 +27,17 @@ public class ReducerUnitTest extends BaseTest { List tests = new ArrayList(); for ( final int groupSize : Arrays.asList(-1, 1, 5, 50, 500, 5000, 50000) ) { - for ( final boolean setJobIDAtStart : Arrays.asList(true, false) ) { - for ( final int nElements : Arrays.asList(0, 1, 3, 5) ) { - if ( groupSize < nElements ) { - for ( final List> jobs : Utils.makePermutations(makeJobs(nElements), nElements, false) ) { - tests.add(new Object[]{ new ListOfJobs(jobs), setJobIDAtStart, groupSize }); - } + for ( final int nElements : Arrays.asList(0, 1, 3, 5) ) { + if ( groupSize < nElements ) { + for ( final List> jobs : Utils.makePermutations(makeJobs(nElements), nElements, false) ) { + tests.add(new Object[]{ new ListOfJobs(jobs), groupSize }); } } + } - for ( final int nElements : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) ) { - if ( groupSize < nElements ) { - tests.add(new Object[]{ new ListOfJobs(makeJobs(nElements)), setJobIDAtStart, groupSize }); - } + for ( final int nElements : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) ) { + if ( groupSize < nElements ) { + tests.add(new Object[]{ new ListOfJobs(makeJobs(nElements)), groupSize }); } } } @@ -77,11 +75,7 @@ public class ReducerUnitTest extends BaseTest { } @Test(enabled = true, dataProvider = "ReducerThreadTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) - public void testReducerThread(final List> jobs, final boolean setJobIDAtStart, final int groupSize) throws Exception { - runTests(jobs, setJobIDAtStart, groupSize); - } - - private void runTests( final List> allJobs, boolean setJobIDAtStart, int groupSize ) throws Exception { + public void testReducerThread(final List> allJobs, int groupSize) throws Exception { if ( groupSize == -1 ) groupSize = allJobs.size(); @@ -99,7 +93,6 @@ public class ReducerUnitTest extends BaseTest { int nJobsSubmitted = 0; int jobGroupCount = 0; final int lastJobGroupCount = jobGroups.size() - 1; - setJobIDAtStart = setJobIDAtStart && groupSize == 1; for ( final List> jobs : jobGroups ) { //logger.warn("Processing job group " + jobGroupCount + " with " + jobs.size() + " jobs"); @@ -114,48 +107,17 @@ public class ReducerUnitTest extends BaseTest { nJobsSubmitted++; } - Assert.assertFalse(reducer.latchIsReleased(), "Latch should be closed at the start"); - - if ( jobGroupCount == 0 && setJobIDAtStart ) { - // only can do the setJobID if jobs cannot be submitted out of order - reducer.setTotalJobCount(allJobs.size()+1); - Assert.assertFalse(reducer.latchIsReleased(), "Latch should be closed even after setting last job if we haven't processed anything"); - } - final int nReduced = reducer.reduceAsMuchAsPossible(mapResultsQueue, true); Assert.assertTrue(nReduced <= nJobsSubmitted, "Somehow reduced more jobs than submitted"); - if ( setJobIDAtStart ) { - final boolean submittedLastJob = jobGroupCount == lastJobGroupCount; - Assert.assertEquals(reducer.latchIsReleased(), submittedLastJob, - "When last job is set, latch should only be released if the last job has been submitted"); - } else { - Assert.assertEquals(reducer.latchIsReleased(), false, "When last job isn't set, latch should never be release"); - } - jobGroupCount++; } - if ( setJobIDAtStart ) - Assert.assertTrue(reducer.latchIsReleased(), "Latch should be released after reducing with last job id being set"); - else { - Assert.assertFalse(reducer.latchIsReleased(), "Latch should be closed after reducing without last job id being set"); - reducer.setTotalJobCount(allJobs.size() + 1); - Assert.assertTrue(reducer.latchIsReleased(), "Latch should be released after reducing after setting last job id "); - } - Assert.assertEquals(reduce.nRead, allJobs.size(), "number of read values not all of the values in the reducer queue"); es.shutdown(); es.awaitTermination(1, TimeUnit.HOURS); } - @Test(enabled = true, expectedExceptions = IllegalStateException.class) - private void runSettingJobIDTwice() throws Exception { - final Reducer reducer = new Reducer(new ReduceSumTest(), new MultiThreadedErrorTracker(), 0); - reducer.setTotalJobCount(10); - reducer.setTotalJobCount(15); - } - @Test(timeOut = 1000, invocationCount = 100) private void testNonBlockingReduce() throws Exception { final Reducer reducer = new Reducer(new ReduceSumTest(), new MultiThreadedErrorTracker(), 0); @@ -242,12 +204,8 @@ public class ReducerUnitTest extends BaseTest { @Override public void run() { - try { - final int observedSum = reducer.waitForFinalReduce(); - Assert.assertEquals(observedSum, expectedSum, "Reduce didn't sum to expected value"); - } catch ( InterruptedException ex ) { - Assert.fail("Got interrupted"); - } + final int observedSum = reducer.getReduceResult(); + Assert.assertEquals(observedSum, expectedSum, "Reduce didn't sum to expected value"); } } } \ No newline at end of file From 7d250a789a4f14b8da0ba9fec320bfdaca94e238 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Sun, 23 Dec 2012 08:54:27 -0500 Subject: [PATCH 14/22] ArtificialReadPileupTestProvider now creates GATKSamRecords with good header values --- .../genotyper/ArtificialReadPileupTestProvider.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java index e48c4d197..39b81f4a2 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ArtificialReadPileupTestProvider.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; @@ -53,11 +54,11 @@ public class ArtificialReadPileupTestProvider { final String artificialReadName = "synth"; final int artificialRefStart = 1; final int artificialMappingQuality = 60; - Map sample2RG = new HashMap(); + Map sample2RG = new HashMap(); List sampleRGs; List sampleNames = new ArrayList(); private String sampleName(int i) { return sampleNames.get(i); } - private SAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); } + private GATKSAMReadGroupRecord sampleRG(String name) { return sample2RG.get(name); } public final int locStart = 105; // start position where we desire artificial variant private final int readLength = 10; // desired read length in pileup public final int readOffset = 4; @@ -75,7 +76,7 @@ public class ArtificialReadPileupTestProvider { for ( int i = 0; i < numSamples; i++ ) { sampleNames.add(String.format("%s%04d", SAMPLE_PREFIX, i)); - SAMReadGroupRecord rg = createRG(sampleName(i)); + GATKSAMReadGroupRecord rg = createRG(sampleName(i)); sampleRGs.add(rg); sample2RG.put(sampleName(i), rg); } @@ -134,8 +135,8 @@ public class ArtificialReadPileupTestProvider { return contexts; } - private SAMReadGroupRecord createRG(String name) { - SAMReadGroupRecord rg = new SAMReadGroupRecord(name); + private GATKSAMReadGroupRecord createRG(String name) { + GATKSAMReadGroupRecord rg = new GATKSAMReadGroupRecord(name); rg.setPlatform("ILLUMINA"); rg.setSample(name); return rg; @@ -189,7 +190,7 @@ public class ArtificialReadPileupTestProvider { read.setMappingQuality(artificialMappingQuality); read.setReferenceName(loc.getContig()); read.setReadNegativeStrandFlag(false); - read.setAttribute("RG", sampleRG(sample).getReadGroupId()); + read.setReadGroup(sampleRG(sample)); pileupElements.add(new PileupElement(read,readOffset,false,isBeforeDeletion, false, isBeforeInsertion,false,false,altBases,Math.abs(eventLength))); From 7bf1f672736974f91ed25ba5aeb4d4729d79d756 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 24 Dec 2012 10:46:47 -0500 Subject: [PATCH 15/22] BQSR optimization: read group x quality score calibration table is thread-local -- AdvancedRecalibrationEngine now uses a thread-local table for the quality score table, and in finalizeData merges these thread-local tables into the final table. Radically reduces the contention for RecalDatum in this very highly used table -- Refactored the utility function to combine two tables into RecalUtils, and created UnitTests for this function, as well as all of RecalibrationTables. Updated combine in RecalibrationReport to use this table combiner function -- Made several core functions in RecalDatum into final methods for performance -- Added RecalibrationTestUtils, a home for recalibration testing utilities --- .../bqsr/AdvancedRecalibrationEngine.java | 41 ++++- .../sting/utils/recalibration/RecalDatum.java | 16 +- .../sting/utils/recalibration/RecalUtils.java | 24 +++ .../recalibration/RecalibrationReport.java | 13 +- .../recalibration/RecalibrationTables.java | 76 ++++++--- .../recalibration/RecalUtilsUnitTest.java | 152 ++++++++++++++++++ .../RecalibrationTablesUnitTest.java | 84 ++++++++++ .../recalibration/RecalibrationTestUtils.java | 49 ++++++ 8 files changed, 414 insertions(+), 41 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index b56e5cc16..255f1fd05 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -25,16 +25,35 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * OTHER DEALINGS IN THE SOFTWARE. */ +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; +import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.recalibration.RecalDatum; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.util.LinkedList; +import java.util.List; + public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { + private final static Logger logger = Logger.getLogger(AdvancedRecalibrationEngine.class); + + final List> allThreadLocalQualityScoreTables = new LinkedList>(); + private ThreadLocal> threadLocalQualityScoreTables = new ThreadLocal>() { + @Override + protected synchronized NestedIntegerArray initialValue() { + final NestedIntegerArray table = recalibrationTables.makeQualityScoreTable(); + allThreadLocalQualityScoreTables.add(table); + return table; + } + }; + @Override public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { final GATKSAMRecord read = recalInfo.getRead(); final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); + final NestedIntegerArray qualityScoreTable = getThreadLocalQualityScoreTable(); for( int offset = 0; offset < read.getReadBases().length; offset++ ) { if( ! recalInfo.skip(offset) ) { @@ -45,7 +64,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp final byte qual = recalInfo.getQual(eventType, offset); final double isError = recalInfo.getErrorFraction(eventType, offset); - incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); + incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) @@ -57,4 +76,24 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp } } } + + /** + * Get a NestedIntegerArray for a QualityScore table specific to this thread + * @return a non-null NestedIntegerArray ready to be used to collect calibration info for the quality score covariate + */ + private NestedIntegerArray getThreadLocalQualityScoreTable() { + return threadLocalQualityScoreTables.get(); + } + + @Override + public void finalizeData() { + // merge in all of the thread local tables + logger.info("Merging " + allThreadLocalQualityScoreTables.size() + " thread-local quality score tables"); + for ( final NestedIntegerArray localTable : allThreadLocalQualityScoreTables ) { + recalibrationTables.combineQualityScoreTable(localTable); + } + allThreadLocalQualityScoreTables.clear(); // cleanup after ourselves + + super.finalizeData(); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 4cacc26c4..8ae2eed82 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -212,49 +212,49 @@ public class RecalDatum { // //--------------------------------------------------------------------------------------------------------------- - public double getNumObservations() { + public final double getNumObservations() { return numObservations; } - public synchronized void setNumObservations(final double numObservations) { + public final synchronized void setNumObservations(final double numObservations) { if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); this.numObservations = numObservations; empiricalQuality = UNINITIALIZED; } - public double getNumMismatches() { + public final double getNumMismatches() { return numMismatches; } @Requires({"numMismatches >= 0"}) - public synchronized void setNumMismatches(final double numMismatches) { + public final synchronized void setNumMismatches(final double numMismatches) { if ( numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); this.numMismatches = numMismatches; empiricalQuality = UNINITIALIZED; } @Requires({"by >= 0"}) - public synchronized void incrementNumObservations(final double by) { + public final synchronized void incrementNumObservations(final double by) { numObservations += by; empiricalQuality = UNINITIALIZED; } @Requires({"by >= 0"}) - public synchronized void incrementNumMismatches(final double by) { + public final synchronized void incrementNumMismatches(final double by) { numMismatches += by; empiricalQuality = UNINITIALIZED; } @Requires({"incObservations >= 0", "incMismatches >= 0"}) @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"}) - public synchronized void increment(final double incObservations, final double incMismatches) { + public final synchronized void increment(final double incObservations, final double incMismatches) { numObservations += incObservations; numMismatches += incMismatches; empiricalQuality = UNINITIALIZED; } @Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"}) - public synchronized void increment(final boolean isError) { + public final synchronized void increment(final boolean isError) { increment(1, isError ? 1 : 0.0); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index e9fe316a8..d4e781fdd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -769,4 +769,28 @@ public class RecalUtils { return base; } } + + /** + * Combines the recalibration data for table1 and table2 into table1 + * + * Note that table1 is the destination, so it is modified + * + * @param table1 the destination table to merge table2 into + * @param table2 the source table to merge into table1 + */ + public static void combineTables(final NestedIntegerArray table1, final NestedIntegerArray table2) { + if ( table1 == null ) throw new IllegalArgumentException("table1 cannot be null"); + if ( table2 == null ) throw new IllegalArgumentException("table2 cannot be null"); + if ( ! Arrays.equals(table1.getDimensions(), table2.getDimensions())) + throw new IllegalArgumentException("Table1 " + Utils.join(",", table1.getDimensions()) + " not equal to " + Utils.join(",", table2.getDimensions())); + + for (final NestedIntegerArray.Leaf row : table2.getAllLeaves()) { + final RecalDatum myDatum = table1.get(row.keys); + + if (myDatum == null) + table1.put(row.value, row.keys); + else + myDatum.combine(row.value); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 871c1eff6..4ff17f302 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -81,7 +81,7 @@ public class RecalibrationReport { /** * Counts the number of unique read groups in the table * - * @param reportTable the GATKReport table containing data for this table + * @param reportTable the GATKReport table containing data for this table * @return the number of unique read groups */ private int countReadGroups(final GATKReportTable reportTable) { @@ -105,19 +105,10 @@ public class RecalibrationReport { * @param other the recalibration report to combine with this one */ public void combine(final RecalibrationReport other) { - for ( int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++ ) { final NestedIntegerArray myTable = recalibrationTables.getTable(tableIndex); final NestedIntegerArray otherTable = other.recalibrationTables.getTable(tableIndex); - - for (final NestedIntegerArray.Leaf row : otherTable.getAllLeaves()) { - final RecalDatum myDatum = myTable.get(row.keys); - - if (myDatum == null) - myTable.put((RecalDatum)row.value, row.keys); - else - myDatum.combine((RecalDatum)row.value); - } + RecalUtils.combineTables(myTable, otherTable); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java index 0dd510245..3f968d7f6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -25,11 +25,13 @@ package org.broadinstitute.sting.utils.recalibration; +import com.google.java.contract.Ensures; import org.broadinstitute.sting.utils.collections.LoggingNestedIntegerArray; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import java.io.PrintStream; +import java.util.ArrayList; /** * Utility class to facilitate on-the-fly base quality score recalibration. @@ -38,8 +40,7 @@ import java.io.PrintStream; * Date: 6/20/12 */ -public class RecalibrationTables { - +public final class RecalibrationTables { public enum TableType { READ_GROUP_TABLE(0), QUALITY_SCORE_TABLE(1), @@ -52,49 +53,82 @@ public class RecalibrationTables { } } - private final NestedIntegerArray[] tables; + private final ArrayList> tables; + private final int qualDimension; + private final int eventDimension = EventType.values().length; + private final int numReadGroups; + private final PrintStream log; public RecalibrationTables(final Covariate[] covariates) { this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, null); } - public RecalibrationTables(final Covariate[] covariates, final PrintStream log) { - this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, log); - } - public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) { this(covariates, numReadGroups, null); } public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) { - tables = new NestedIntegerArray[covariates.length]; + tables = new ArrayList>(covariates.length); + for ( int i = 0; i < covariates.length; i++ ) + tables.add(i, null); // initialize so we can set below - final int qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.index].maximumKeyValue() + 1; - final int eventDimension = EventType.values().length; + qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.index].maximumKeyValue() + 1; + this.numReadGroups = numReadGroups; + this.log = log; + + tables.set(TableType.READ_GROUP_TABLE.index, + log == null ? new NestedIntegerArray(numReadGroups, eventDimension) : + new LoggingNestedIntegerArray(log, "READ_GROUP_TABLE", numReadGroups, eventDimension)); + + tables.set(TableType.QUALITY_SCORE_TABLE.index, makeQualityScoreTable()); - tables[TableType.READ_GROUP_TABLE.index] = log == null ? new NestedIntegerArray(numReadGroups, eventDimension) : - new LoggingNestedIntegerArray(log, "READ_GROUP_TABLE", numReadGroups, eventDimension); - tables[TableType.QUALITY_SCORE_TABLE.index] = log == null ? new NestedIntegerArray(numReadGroups, qualDimension, eventDimension) : - new LoggingNestedIntegerArray(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension); for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.index; i < covariates.length; i++) - tables[i] = log == null ? new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) : - new LoggingNestedIntegerArray(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.index + 1), - numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension); + tables.set(i, + log == null ? new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) : + new LoggingNestedIntegerArray(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.index + 1), + numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension)); } + @Ensures("result != null") public NestedIntegerArray getReadGroupTable() { - return (NestedIntegerArray)tables[TableType.READ_GROUP_TABLE.index]; + return getTable(TableType.READ_GROUP_TABLE.index); } + @Ensures("result != null") public NestedIntegerArray getQualityScoreTable() { - return (NestedIntegerArray)tables[TableType.QUALITY_SCORE_TABLE.index]; + return getTable(TableType.QUALITY_SCORE_TABLE.index); } + @Ensures("result != null") public NestedIntegerArray getTable(final int index) { - return (NestedIntegerArray)tables[index]; + return tables.get(index); } + @Ensures("result >= 0") public int numTables() { - return tables.length; + return tables.size(); + } + + /** + * Allocate a new quality score table, based on requested parameters + * in this set of tables, without any data in it. The return result + * of this table is suitable for acting as a thread-local cache + * for quality score values + * @return a newly allocated, empty read group x quality score table + */ + public NestedIntegerArray makeQualityScoreTable() { + return log == null + ? new NestedIntegerArray(numReadGroups, qualDimension, eventDimension) + : new LoggingNestedIntegerArray(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension); + } + + /** + * Merge in the quality score table information from qualityScoreTable into this + * recalibration table's quality score table. + * + * @param qualityScoreTable the quality score table we want to merge in + */ + public void combineQualityScoreTable(final NestedIntegerArray qualityScoreTable) { + RecalUtils.combineTables(getQualityScoreTable(), qualityScoreTable); } } diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java new file mode 100644 index 000000000..500a41e74 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +public final class RecalUtilsUnitTest extends BaseTest { + private class Row { + int rg, qual, ne, no; + + private Row(final Row copy) { + this(copy.rg, copy.qual, copy.ne, copy.no); + } + + private Row(int rg, int qual, int ne, int no) { + this.rg = rg; + this.qual = qual; + this.ne = ne; + this.no = no; + } + + @Override + public String toString() { + return "Row{" + + "" + rg + + ", " + qual + + ", " + ne + + ", " + no + + '}'; + } + } + + @DataProvider(name = "CombineTablesProvider") + public Object[][] createCombineTablesProvider() { + List tests = new ArrayList(); + + final List rows = new ArrayList(); + for ( final int rg : Arrays.asList(0, 1) ) { + for ( final int qual : Arrays.asList(0, 1) ) { + rows.add(new Row(rg, qual, 1, 10)); + } + } + + logger.warn("Number of rows " + rows.size()); + + List> permutations = new LinkedList>(); + permutations.addAll(Utils.makePermutations(rows, 1, false)); + permutations.addAll(Utils.makePermutations(rows, 2, false)); + permutations.addAll(Utils.makePermutations(rows, 3, false)); + + // adding 1 row to 2 + for ( final List table1 : permutations ) { + for ( final Row table2 : rows ) { + tests.add(new Object[]{table1, Arrays.asList(table2)}); + } + } + + // adding 2 rows to 1 + for ( final List table1 : permutations ) { + for ( final Row table2 : rows ) { + tests.add(new Object[]{Arrays.asList(table2), table1}); + } + } + + for ( final List table1 : permutations ) { + for ( final List table2 : permutations ) { + tests.add(new Object[]{table1, table2}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "CombineTablesProvider") + public void testCombineTables(final List table1, final List table2) { + final NestedIntegerArray nia1 = makeTable(table1); + final NestedIntegerArray nia2 = makeTable(table2); + final List expectedRows = makeExpected(table1, table2); + final NestedIntegerArray expected = makeTable(expectedRows); + RecalUtils.combineTables(nia1, nia2); + + Assert.assertEquals(nia1.getDimensions(), expected.getDimensions()); + Assert.assertEquals(nia1.getAllValues().size(), expected.getAllValues().size()); + + for ( final NestedIntegerArray.Leaf leaf : expected.getAllLeaves() ) { + final RecalDatum actual = nia1.get(leaf.keys); + Assert.assertEquals(actual.getNumMismatches(), leaf.value.getNumMismatches()); + Assert.assertEquals(actual.getNumObservations(), leaf.value.getNumObservations()); + } + } + + public List makeExpected(final List table1, final List table2) { + final List combined = new LinkedList(); + for ( final Row t1 : table1 ) combined.add(new Row(t1)); + for ( final Row t2 : table2 ) { + combine(combined, t2); + } + return combined; + } + + private void combine(final List combined, final Row row) { + for ( final Row c : combined ) { + if ( c.rg == row.rg && c.qual == row.qual ) { + c.ne += row.ne; + c.no += row.no; + return; + } + } + + combined.add(new Row(row)); + } + + public NestedIntegerArray makeTable(final List rows) { + final NestedIntegerArray x = new NestedIntegerArray(3, 3); + for ( final Row r : rows ) + x.put(new RecalDatum(r.no, r.ne, (byte)10), r.rg, r.qual); + return x; + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java new file mode 100644 index 000000000..93e52ae83 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import org.broadinstitute.sting.utils.recalibration.covariates.*; +import org.testng.Assert; +import org.testng.annotations.Test; + +public final class RecalibrationTablesUnitTest extends BaseTest { + @Test + public void basicTest() { + final Covariate[] covariates = RecalibrationTestUtils.makeInitializedStandardCovariates(); + final int numReadGroups = 6; + final RecalibrationTables tables = new RecalibrationTables(covariates, numReadGroups); + + final Covariate qualCov = covariates[1]; + final Covariate cycleCov = covariates[2]; + final Covariate contextCov = covariates[3]; + + Assert.assertEquals(tables.numTables(), covariates.length); + + Assert.assertNotNull(tables.getReadGroupTable()); + Assert.assertEquals(tables.getReadGroupTable(), tables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE.index)); + testDimensions(tables.getReadGroupTable(), numReadGroups); + + Assert.assertNotNull(tables.getQualityScoreTable()); + Assert.assertEquals(tables.getQualityScoreTable(), tables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE.index)); + testDimensions(tables.getQualityScoreTable(), numReadGroups, qualCov.maximumKeyValue() + 1); + + Assert.assertNotNull(tables.getTable(2)); + testDimensions(tables.getTable(2), numReadGroups, qualCov.maximumKeyValue() + 1, cycleCov.maximumKeyValue() + 1); + + Assert.assertNotNull(tables.getTable(3)); + testDimensions(tables.getTable(3), numReadGroups, qualCov.maximumKeyValue() + 1, contextCov.maximumKeyValue() + 1); + } + + private void testDimensions(final NestedIntegerArray table, final int ... dimensions) { + final int[] dim = new int[dimensions.length+1]; + System.arraycopy(dimensions, 0, dim, 0, dimensions.length); + dim[dimensions.length] = EventType.values().length; + Assert.assertEquals(table.getDimensions().length, dim.length); + + for ( int i = 0; i < dim.length; i++ ) { + Assert.assertEquals(table.getDimensions()[i], dim[i], "Table dimensions not expected at dim " + i); + } + } + + @Test + public void basicMakeQualityScoreTable() { + final Covariate[] covariates = RecalibrationTestUtils.makeInitializedStandardCovariates(); + final int numReadGroups = 6; + final RecalibrationTables tables = new RecalibrationTables(covariates, numReadGroups); + + final Covariate qualCov = covariates[1]; + final NestedIntegerArray copy = tables.makeQualityScoreTable(); + testDimensions(copy, numReadGroups, qualCov.maximumKeyValue()+1); + Assert.assertEquals(copy.getAllValues().size(), 0); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java new file mode 100644 index 000000000..bf3917e70 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTestUtils.java @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.recalibration.covariates.*; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 12/23/12 + * Time: 1:06 PM + * To change this template use File | Settings | File Templates. + */ +public class RecalibrationTestUtils { + public static Covariate[] makeInitializedStandardCovariates() { + final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); + final Covariate[] covariates = new Covariate[4]; + covariates[0] = new ReadGroupCovariate(); + covariates[1] = new QualityScoreCovariate(); + covariates[2] = new ContextCovariate(); + covariates[3] = new CycleCovariate(); + for ( Covariate cov : covariates ) cov.initialize(RAC); + return covariates; + } +} From 7ec7a5d6b6e43f1ae66060949cfb7b4308320859 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 24 Dec 2012 11:27:32 -0500 Subject: [PATCH 16/22] Misc. improvements to clover in build.xml -- Allow instrument level to be overridden on command line with -Dclover.instrument.level=statement --- build.xml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/build.xml b/build.xml index 0cfc1a8cd..f91ff6a9a 100644 --- a/build.xml +++ b/build.xml @@ -109,6 +109,12 @@ + + + + + + @@ -1120,9 +1126,6 @@ - - - @@ -1138,7 +1141,7 @@ - + From 04cc75aaec54acad39a65c48c57cc4ba02547ae3 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 24 Dec 2012 13:26:58 -0500 Subject: [PATCH 17/22] Minor cleanup and expansion of the RecalDatum unit tests --- .../sting/utils/recalibration/RecalDatum.java | 8 ------ .../recalibration/RecalDatumUnitTest.java | 27 +++++++++++++++++++ .../RecalibrationReportUnitTest.java | 14 +++++++--- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 8ae2eed82..1ab3b10c4 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -135,14 +135,6 @@ public class RecalDatum { this.estimatedQReported = estimatedQReported; } - public static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { - final Random random = new Random(); - final int nObservations = random.nextInt(maxObservations); - final int nErrors = random.nextInt(maxErrors); - final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE); - return new RecalDatum(nObservations, nErrors, (byte)qual); - } - public final double getEstimatedQReported() { return estimatedQReported; } diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java index 715acad03..0f0c74362 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java @@ -106,6 +106,11 @@ public class RecalDatumUnitTest extends BaseTest { Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual()); BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalQuality(), cfg.getErrorRatePhredScaled()); BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalErrorRate(), cfg.getErrorRate()); + + final double e = datum.getEmpiricalQuality(); + Assert.assertTrue(datum.getEmpiricalQualityAsByte() >= Math.floor(e)); + Assert.assertTrue(datum.getEmpiricalQualityAsByte() <= Math.ceil(e)); + Assert.assertNotNull(datum.toString()); } @Test(dataProvider = "RecalDatumTestProvider") @@ -145,10 +150,32 @@ public class RecalDatumUnitTest extends BaseTest { cfg.exTotal++; assertBasicFeaturesOfRecalDatum(datum, cfg); + datum = cfg.makeRecalDatum(); + datum.increment(false); + cfg.exTotal++; + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + datum.incrementNumObservations(2); + cfg.exTotal += 2; + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); + datum.incrementNumMismatches(2); + cfg.exError += 2; + assertBasicFeaturesOfRecalDatum(datum, cfg); + + datum = cfg.makeRecalDatum(); datum.increment(10, 5); cfg.exError += 5; cfg.exTotal += 10; assertBasicFeaturesOfRecalDatum(datum, cfg); } + + @Test + public void testNoObs() { + final RecalDatum rd = new RecalDatum(0, 0, (byte)10); + Assert.assertEquals(rd.getEmpiricalErrorRate(), 0.0); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java index d597b9f2c..b190f2ff2 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -21,6 +21,14 @@ import java.util.*; * @since 4/21/12 */ public class RecalibrationReportUnitTest { + private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { + final Random random = new Random(); + final int nObservations = random.nextInt(maxObservations); + final int nErrors = random.nextInt(maxErrors); + final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE); + return new RecalDatum(nObservations, nErrors, (byte)qual); + } + @Test(enabled = false) public void testOutput() { final int length = 100; @@ -86,12 +94,12 @@ public class RecalibrationReportUnitTest { final int[] covariates = rc.getKeySet(offset, errorMode); final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; - rgTable.put(RecalDatum.createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.index); - qualTable.put(RecalDatum.createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.index); + rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.index); + qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.index); nKeys += 2; for (int j = 0; j < optionalCovariates.size(); j++) { final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j); - covTable.put(RecalDatum.createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], j, covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j], errorMode.index); + covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], j, covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j], errorMode.index); nKeys++; } } From af9746af52f6186209b7b19239f74817666b4389 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 24 Dec 2012 13:43:04 -0500 Subject: [PATCH 18/22] Fix merge failure --- .../broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java | 1 + 1 file changed, 1 insertion(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index dcaaafefd..7692c58e2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -38,6 +38,7 @@ import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; From 64c3a0ff62f987bc29c79da3a709a4d09942d5b9 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Mon, 24 Dec 2012 13:53:29 -0500 Subject: [PATCH 19/22] Remove dependance on clover in build.xml --- build.xml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/build.xml b/build.xml index f91ff6a9a..47e4eeb47 100644 --- a/build.xml +++ b/build.xml @@ -1045,7 +1045,6 @@ - @@ -1136,6 +1135,10 @@ + + + + From efceb0d48c4ee466ad762c53e8fe3e54b4823fdc Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 26 Dec 2012 14:30:51 -0500 Subject: [PATCH 20/22] Check for well-encoded reads while fixing mis-encoded ones --- .../sting/utils/sam/MisencodedBaseQualityReadTransformer.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java b/public/java/src/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java index cac51239a..17d40e530 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/MisencodedBaseQualityReadTransformer.java @@ -47,6 +47,8 @@ public class MisencodedBaseQualityReadTransformer extends ReadTransformer { final byte[] quals = read.getBaseQualities(); for ( int i = 0; i < quals.length; i++ ) { quals[i] -= encodingFixValue; + if ( quals[i] < 0 ) + throw new UserException.BadInput("while fixing mis-encoded base qualities we encountered a read that was correctly encoded; we cannot handle such a mixture of reads so unfortunately the BAM must be fixed with some other tool"); } read.setBaseQualities(quals); return read; From 75d5b88a3dfb7436c31e17fd49ac3f4b3c509bab Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 26 Dec 2012 15:35:50 -0500 Subject: [PATCH 21/22] Enabling the Recal Report unit test (which looks like it was never ever enabled) --- .../recalibration/RecalibrationReport.java | 4 +- .../covariates/CycleCovariate.java | 2 +- .../RecalibrationReportUnitTest.java | 44 +++++++------------ 3 files changed, 18 insertions(+), 32 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 4ff17f302..081221da4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -69,12 +69,12 @@ public class RecalibrationReport { } - protected RecalibrationReport(final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final GATKReportTable argumentTable, final RecalibrationArgumentCollection RAC) { + protected RecalibrationReport(final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final GATKReportTable argumentTable, final RecalibrationArgumentCollection RAC) { this.quantizationInfo = quantizationInfo; this.recalibrationTables = recalibrationTables; + this.requestedCovariates = requestedCovariates; this.argumentTable = argumentTable; this.RAC = RAC; - this.requestedCovariates = null; this.optionalCovariateIndexes = null; } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java index 6bff833e4..e3c822821 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java @@ -50,7 +50,7 @@ import java.util.EnumSet; public class CycleCovariate implements StandardCovariate { private int MAXIMUM_CYCLE_VALUE; - private static final int CUSHION_FOR_INDELS = 4; + public static final int CUSHION_FOR_INDELS = 4; private String default_platform = null; private static final EnumSet DISCRETE_CYCLE_PLATFORMS = EnumSet.of(NGSPlatform.ILLUMINA, NGSPlatform.SOLID, NGSPlatform.PACBIO, NGSPlatform.COMPLETE_GENOMICS); diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java index b190f2ff2..9707ed078 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -4,16 +4,12 @@ import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollectio import org.broadinstitute.sting.utils.recalibration.covariates.*; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; import org.testng.annotations.Test; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; import java.util.*; /** @@ -29,7 +25,7 @@ public class RecalibrationReportUnitTest { return new RecalDatum(nObservations, nErrors, (byte)qual); } - @Test(enabled = false) + @Test(enabled = true) public void testOutput() { final int length = 100; @@ -79,7 +75,7 @@ public class RecalibrationReportUnitTest { readQuals[i] = 20; read.setBaseQualities(readQuals); - final int expectedKeys = expectedNumberOfKeys(4, length, RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); + final int expectedKeys = expectedNumberOfKeys(length, RAC.INDELS_CONTEXT_SIZE, RAC.MISMATCHES_CONTEXT_SIZE); int nKeys = 0; // keep track of how many keys were produced final ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); @@ -99,35 +95,25 @@ public class RecalibrationReportUnitTest { nKeys += 2; for (int j = 0; j < optionalCovariates.size(); j++) { final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j); - covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], j, covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j], errorMode.index); - nKeys++; + final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j]; + if ( covValue >= 0 ) { + covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.index); + nKeys++; + } } } } Assert.assertEquals(nKeys, expectedKeys); - - final RecalibrationReport report = new RecalibrationReport(quantizationInfo, recalibrationTables, RAC.generateReportTable("ignore"), RAC); - - File output = new File("RecalibrationReportUnitTestOutuput.grp"); - PrintStream out; - try { - out = new PrintStream(output); - } catch (FileNotFoundException e) { - throw new ReviewedStingException("couldn't create the file " + output, e); - } - report.output(out); - - RecalibrationReport loadedReport = new RecalibrationReport(output); - - Assert.assertTrue(report.equals(loadedReport)); - if (!output.delete()) - throw new ReviewedStingException("File could not be deleted " + output); } - private static int expectedNumberOfKeys (int nCovariates, int readLength, int indelContextSize, int mismatchesContextSize) { - int nommcs = readLength >= mismatchesContextSize ? mismatchesContextSize-1 : readLength; - int noincs = readLength >= indelContextSize ? 2*(indelContextSize-1) : 2*readLength; - return (nCovariates * readLength * 3) - nommcs - noincs; + private static int expectedNumberOfKeys (int readLength, int indelContextSize, int mismatchesContextSize) { + final int numCovariates = 4; + final int numTables = 3; + final int mismatchContextPadding = mismatchesContextSize - 1; + final int indelContextPadding = 2 * (indelContextSize - 1); + final int indelCyclePadding = 2 * (2 * CycleCovariate.CUSHION_FOR_INDELS); + + return (numCovariates * numTables * readLength) - mismatchContextPadding - indelContextPadding - indelCyclePadding; } } From 275575462f9bf708d223c9ba497e5908a6deced9 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Wed, 26 Dec 2012 15:46:21 -0500 Subject: [PATCH 22/22] Protect against non-standard ref bases. Ryan, please review. --- .../walkers/haplotypecaller/LikelihoodCalculationEngine.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 59f241cdb..e1a1ed6d2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -120,6 +120,11 @@ public class LikelihoodCalculationEngine { for( int jjj = 0; jjj < numHaplotypes; jjj++ ) { final Haplotype haplotype = haplotypes.get(jjj); + + // TODO -- need to test against a reference/position with non-standard bases + if ( !Allele.acceptableAlleleBases(haplotype.getBases(), false) ) + continue; + final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); previousHaplotypeSeen = haplotype;