diff --git a/.gitignore b/.gitignore index 8623fa076..927caf98d 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,8 @@ queueScatterGather /bar* integrationtests/ public/testdata/onTheFlyOutputTest.vcf +build/ +dist/ +dump/ +lib/ +out/ diff --git a/build.xml b/build.xml index f681ddafa..c6b1afc56 100644 --- a/build.xml +++ b/build.xml @@ -22,7 +22,9 @@ ~ OTHER DEALINGS IN THE SOFTWARE. --> - + Compile and distribute the Sting toolkit @@ -250,11 +252,14 @@ + + - + + @@ -262,6 +267,15 @@ uri="antlib:org.apache.ivy.ant" classpath="${ivy.jar.dir}/${ivy.jar.file}"/> + + + + + @@ -295,7 +309,7 @@ - + @@ -577,6 +591,7 @@ docletpathref="doclet.classpath" classpathref="external.dependencies" classpath="${java.classes}" + maxmemory="2g" additionalparam="-build-timestamp "${build.timestamp}" -absolute-version ${build.version} -out ${basedir}/${resource.path} -quiet"> @@ -780,6 +795,7 @@ docletpathref="doclet.classpath" classpathref="external.dependencies" classpath="${java.classes}" + maxmemory="2g" additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp "${build.timestamp}" -absolute-version ${build.version} -quiet"> @@ -940,6 +956,28 @@ + + + + + + + + + + + + + + + + + + + @@ -1177,7 +1215,7 @@ - + diff --git a/intellij_example.tar.bz2 b/intellij_example.tar.bz2 new file mode 100644 index 000000000..bce16045c Binary files /dev/null and b/intellij_example.tar.bz2 differ diff --git a/ivy.xml b/ivy.xml index 0761cb411..1d2f95dc1 100644 --- a/ivy.xml +++ b/ivy.xml @@ -46,7 +46,8 @@ - + + @@ -78,8 +79,8 @@ - - + + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java new file mode 100755 index 000000000..59357e1c4 --- /dev/null +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2010. 
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.pileup.*; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.io.PrintStream; +import java.util.*; + +public class AlleleBiasedDownsamplingUtils { + + /** + * Computes an allele biased version of the given pileup + * + * @param pileup the original pileup + * @param downsamplingFraction the fraction of total reads to remove per allele + * @param log logging output + * @return allele biased pileup + */ + public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { + // special case removal of all or no reads + if ( downsamplingFraction <= 0.0 ) + return pileup; + if ( downsamplingFraction >= 1.0 ) + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList()); + + final ArrayList[] alleleStratifiedElements = new ArrayList[4]; + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i] = new ArrayList(); + + // start by stratifying the reads by the alleles they represent at this position + for( final PileupElement pe : pileup ) { + // abort if we have a reduced read - we do not want to remove it! + if ( pe.getRead().isReducedRead() ) + return pileup; + + final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase()); + if ( baseIndex != -1 ) + alleleStratifiedElements[baseIndex].add(pe); + } + + // Down-sample *each* allele by the contamination fraction applied to the entire pileup. + // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. 
+ int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor + final TreeSet elementsToKeep = new TreeSet(new Comparator() { + @Override + public int compare(PileupElement element1, PileupElement element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + + for ( int i = 0; i < 4; i++ ) { + final ArrayList alleleList = alleleStratifiedElements[i]; + if ( alleleList.size() <= numReadsToRemove ) + logAllElements(alleleList, log); + else + elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log)); + } + + // clean up pointers so memory can be garbage collected if needed + for ( int i = 0; i < 4; i++ ) + alleleStratifiedElements[i].clear(); + + return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); + } + + /** + * Performs allele biased down-sampling on a pileup and computes the list of elements to keep + * + * @param elements original list of records + * @param numElementsToRemove the number of records to remove + * @param log logging output + * @return the list of pileup elements TO KEEP + */ + private static List downsampleElements(final ArrayList elements, final int numElementsToRemove, final PrintStream log) { + final int pileupSize = elements.size(); + final BitSet itemsToRemove = new BitSet(pileupSize); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + ArrayList elementsToKeep = new ArrayList(pileupSize - numElementsToRemove); + for ( int i = 0; i < pileupSize; i++ ) { + if ( itemsToRemove.get(i) ) + logRead(elements.get(i).getRead(), log); + else + elementsToKeep.add(elements.get(i)); + } + + return elementsToKeep; + } + + /** + * Computes reads to remove based on an allele biased 
down-sampling + * + * @param alleleReadMap original list of records per allele + * @param downsamplingFraction the fraction of total reads to remove per allele + * @param log logging output + * @return list of reads TO REMOVE from allele biased down-sampling + */ + public static List selectAlleleBiasedReads(final Map> alleleReadMap, final double downsamplingFraction, final PrintStream log) { + int totalReads = 0; + for ( final List reads : alleleReadMap.values() ) + totalReads += reads.size(); + + // Down-sample *each* allele by the contamination fraction applied to the entire pileup. + int numReadsToRemove = (int)(totalReads * downsamplingFraction); + final List readsToRemove = new ArrayList(numReadsToRemove * alleleReadMap.size()); + for ( final List reads : alleleReadMap.values() ) { + if ( reads.size() <= numReadsToRemove ) { + readsToRemove.addAll(reads); + logAllReads(reads, log); + } else { + readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log)); + } + } + + return readsToRemove; + } + + /** + * Performs allele biased down-sampling on a pileup and computes the list of elements to remove + * + * @param reads original list of records + * @param numElementsToRemove the number of records to remove + * @param log logging output + * @return the list of pileup elements TO REMOVE + */ + private static List downsampleReads(final List reads, final int numElementsToRemove, final PrintStream log) { + final int pileupSize = reads.size(); + final BitSet itemsToRemove = new BitSet(pileupSize); + for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { + itemsToRemove.set(selectedIndex); + } + + ArrayList readsToRemove = new ArrayList(pileupSize - numElementsToRemove); + for ( int i = 0; i < pileupSize; i++ ) { + if ( itemsToRemove.get(i) ) { + final GATKSAMRecord read = reads.get(i); + readsToRemove.add(read); + logRead(read, log); + } + } + + return readsToRemove; + } + + private static void 
logAllElements(final List elements, final PrintStream log) { + if ( log != null ) { + for ( final PileupElement p : elements ) + logRead(p.getRead(), log); + } + } + + private static void logAllReads(final List reads, final PrintStream log) { + if ( log != null ) { + for ( final GATKSAMRecord read : reads ) + logRead(read, log); + } + } + + private static void logRead(final SAMRecord read, final PrintStream log) { + if ( log != null ) { + final SAMReadGroupRecord readGroup = read.getReadGroup(); + log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit())); + } + } +} diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index d714ca185..d0bcd0eb3 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -28,75 +28,56 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.recalibration.RecalDatum; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.threading.ThreadLocalArray; public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements 
ProtectedPackageSource { - // optimizations: don't reallocate an array each time - private byte[] tempQualArray; - private boolean[] tempErrorArray; + // optimization: only allocate temp arrays once per thread + private final ThreadLocal threadLocalTempQualArray = new ThreadLocalArray(EventType.values().length, byte.class); + private final ThreadLocal threadLocalTempFractionalErrorArray = new ThreadLocalArray(EventType.values().length, double.class); public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { super.initialize(covariates, recalibrationTables); - tempQualArray = new byte[EventType.values().length]; - tempErrorArray = new boolean[EventType.values().length]; } - /** - * Loop through the list of requested covariates and pick out the value from the read, offset, and reference - * Using the list of covariate values as a key, pick out the RecalDatum and increment, - * adding one to the number of observations and potentially one to the number of mismatches for all three - * categories (mismatches, insertions and deletions). 
- * - * @param pileupElement The pileup element to update - * @param refBase The reference base at this locus - */ - public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { - final int offset = pileupElement.getOffset(); - final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead()); + @Override + public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + for( int offset = 0; offset < read.getReadBases().length; offset++ ) { + if( !skip[offset] ) { + final ReadCovariates readCovariates = covariateKeySetFrom(read); - tempQualArray[EventType.BASE_SUBSTITUTION.index] = pileupElement.getQual(); - tempErrorArray[EventType.BASE_SUBSTITUTION.index] = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase); - tempQualArray[EventType.BASE_INSERTION.index] = pileupElement.getBaseInsertionQual(); - tempErrorArray[EventType.BASE_INSERTION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? pileupElement.isAfterInsertion() : pileupElement.isBeforeInsertion(); - tempQualArray[EventType.BASE_DELETION.index] = pileupElement.getBaseDeletionQual(); - tempErrorArray[EventType.BASE_DELETION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? 
pileupElement.isAfterDeletedBase() : pileupElement.isBeforeDeletedBase(); + byte[] tempQualArray = threadLocalTempQualArray.get(); + double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get(); - for (final EventType eventType : EventType.values()) { - final int[] keys = readCovariates.getKeySet(offset, eventType); - final int eventIndex = eventType.index; - final byte qual = tempQualArray[eventIndex]; - final boolean isError = tempErrorArray[eventIndex]; + tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset]; + tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset]; + tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset]; + tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset]; + tempQualArray[EventType.BASE_DELETION.index] = read.getBaseDeletionQualities()[offset]; + tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset]; - final NestedIntegerArray rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); - final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); - final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it - rgRecalTable.put(rgThisDatum, keys[0], eventIndex); - else - rgPreviousDatum.combine(rgThisDatum); + for (final EventType eventType : EventType.values()) { + final int[] keys = readCovariates.getKeySet(offset, eventType); + final int eventIndex = eventType.index; + final byte qual = tempQualArray[eventIndex]; + final double isError = tempFractionalErrorArray[eventIndex]; - final NestedIntegerArray qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); - final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex); - if (qualPreviousDatum == null) - 
qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); - else - qualPreviousDatum.increment(isError); + combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex); - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); - final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); - if (covPreviousDatum == null) - covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); - else - covPreviousDatum.increment(isError); + incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); + + for (int i = 2; i < covariates.length; i++) { + if (keys[i] < 0) + continue; + + incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + } + } } } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java index 98a96fbfb..654e0af09 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java @@ -1,8 +1,5 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; -import java.util.HashMap; -import java.util.Map; - /** * An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base. 
* @@ -10,47 +7,42 @@ import java.util.Map; * @since 6/15/12 */ public class BaseAndQualsCounts extends BaseCounts { - private final Map sumInsertionQuals; - private final Map sumDeletionQuals; + private final long[] sumInsertionQuals; + private final long[] sumDeletionQuals; public BaseAndQualsCounts() { super(); - this.sumInsertionQuals = new HashMap(); - this.sumDeletionQuals = new HashMap(); - for (BaseIndex i : BaseIndex.values()) { - sumInsertionQuals.put(i, 0L); - sumDeletionQuals.put(i, 0L); + this.sumInsertionQuals = new long[BaseIndex.values().length]; + this.sumDeletionQuals = new long[BaseIndex.values().length]; + for (final BaseIndex i : BaseIndex.values()) { + sumInsertionQuals[i.index] = 0L; + sumDeletionQuals[i.index] = 0L; } } - public void incr(byte base, byte baseQual, byte insQual, byte delQual) { - super.incr(base, baseQual); - BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) { // do not allow Ns - sumInsertionQuals.put(i, sumInsertionQuals.get(i) + insQual); - sumDeletionQuals.put(i, sumDeletionQuals.get(i) + delQual); - } + public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { + final BaseIndex i = BaseIndex.byteToBase(base); + super.incr(i, baseQual); + sumInsertionQuals[i.index] += insQual; + sumDeletionQuals[i.index] += delQual; } - public void decr(byte base, byte baseQual, byte insQual, byte delQual) { - super.decr(base, baseQual); - BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) { // do not allow Ns - sumInsertionQuals.put(i, sumInsertionQuals.get(i) - insQual); - sumDeletionQuals.put(i, sumDeletionQuals.get(i) - delQual); - } + public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { + final BaseIndex i = BaseIndex.byteToBase(base); + super.decr(i, baseQual); + sumInsertionQuals[i.index] -= insQual; + sumDeletionQuals[i.index] -= delQual; } - public byte averageInsertionQualsOfMostCommonBase() { - return 
getGenericAverageQualOfMostCommonBase(sumInsertionQuals); + public byte averageInsertionQualsOfBase(final BaseIndex base) { + return getGenericAverageQualOfBase(base, sumInsertionQuals); } - public byte averageDeletionQualsOfMostCommonBase() { - return getGenericAverageQualOfMostCommonBase(sumDeletionQuals); + public byte averageDeletionQualsOfBase(final BaseIndex base) { + return getGenericAverageQualOfBase(base, sumDeletionQuals); } - private byte getGenericAverageQualOfMostCommonBase(Map sumQuals) { - BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts()); - return (byte) (sumQuals.get(base) / getCount(base)); + private byte getGenericAverageQualOfBase(final BaseIndex base, final long[] sumQuals) { + return (byte) (sumQuals[base.index] / countOfBase(base)); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index ed5802d38..778b8300a 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -3,11 +3,9 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import java.util.EnumMap; -import java.util.Map; /** - * An object to keep track of the number of occurences of each base and it's quality. + * An object to keep track of the number of occurrences of each base and it's quality. 
* * User: depristo * Date: 4/8/11 @@ -18,206 +16,225 @@ import java.util.Map; public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N; public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte(); - private final Map counts; // keeps track of the base counts - private final Map sumQuals; // keeps track of the quals of each base + private final int[] counts; // keeps track of the base counts + private final long[] sumQuals; // keeps track of the quals of each base + private int totalCount = 0; // keeps track of total count since this is requested so often public BaseCounts() { - counts = new EnumMap(BaseIndex.class); - sumQuals = new EnumMap(BaseIndex.class); - for (BaseIndex i : BaseIndex.values()) { - counts.put(i, 0); - sumQuals.put(i, 0L); + counts = new int[BaseIndex.values().length]; + sumQuals = new long[BaseIndex.values().length]; + for (final BaseIndex i : BaseIndex.values()) { + counts[i.index] = 0; + sumQuals[i.index] = 0L; } } public static BaseCounts createWithCounts(int[] countsACGT) { BaseCounts baseCounts = new BaseCounts(); - baseCounts.counts.put(BaseIndex.A, countsACGT[0]); - baseCounts.counts.put(BaseIndex.C, countsACGT[1]); - baseCounts.counts.put(BaseIndex.G, countsACGT[2]); - baseCounts.counts.put(BaseIndex.T, countsACGT[3]); + baseCounts.counts[BaseIndex.A.index] = countsACGT[0]; + baseCounts.counts[BaseIndex.C.index] = countsACGT[1]; + baseCounts.counts[BaseIndex.G.index] = countsACGT[2]; + baseCounts.counts[BaseIndex.T.index] = countsACGT[3]; + baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3]; return baseCounts; } @Requires("other != null") - public void add(BaseCounts other) { - for (BaseIndex i : BaseIndex.values()) - counts.put(i, counts.get(i) + other.counts.get(i)); + public void add(final BaseCounts other) { + for (final BaseIndex i : BaseIndex.values()) { + final int otherCount = other.counts[i.index]; + counts[i.index] += otherCount; + totalCount += 
otherCount; + } } @Requires("other != null") - public void sub(BaseCounts other) { - for (BaseIndex i : BaseIndex.values()) - counts.put(i, counts.get(i) - other.counts.get(i)); - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(byte base) { - BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) // no Ns - counts.put(i, counts.get(i) + 1); - } - - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(byte base, byte qual) { - BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) { // no Ns - counts.put(i, counts.get(i) + 1); - sumQuals.put(i, sumQuals.get(i) + qual); + public void sub(final BaseCounts other) { + for (final BaseIndex i : BaseIndex.values()) { + final int otherCount = other.counts[i.index]; + counts[i.index] -= otherCount; + totalCount -= otherCount; } } - @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(byte base) { - BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) // no Ns - counts.put(i, counts.get(i) - 1); + @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") + public void incr(final byte base) { + final BaseIndex i = BaseIndex.byteToBase(base); + counts[i.index]++; + totalCount++; + } + + @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") + public void incr(final BaseIndex base, final byte qual) { + counts[base.index]++; + totalCount++; + sumQuals[base.index] += qual; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(byte base, byte qual) { - BaseIndex i = BaseIndex.byteToBase(base); - if (i != null) { // no Ns - counts.put(i, counts.get(i) - 1); - sumQuals.put(i, sumQuals.get(i) - qual); - } + public void decr(final byte base) { + final BaseIndex i = BaseIndex.byteToBase(base); + counts[i.index]--; + totalCount--; } - - - 
@Ensures("result >= 0") - public int getCount(byte base) { - return getCount(BaseIndex.byteToBase(base)); + @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") + public void decr(final BaseIndex base, final byte qual) { + counts[base.index]--; + totalCount--; + sumQuals[base.index] -= qual; } @Ensures("result >= 0") - public int getCount(BaseIndex base) { - return counts.get(base); - } - - @Ensures("result >= 0") - public long getSumQuals(byte base) { + public long getSumQuals(final byte base) { return getSumQuals(BaseIndex.byteToBase(base)); } @Ensures("result >= 0") - public long getSumQuals(BaseIndex base) { - return sumQuals.get(base); + public long getSumQuals(final BaseIndex base) { + return sumQuals[base.index]; } @Ensures("result >= 0") - public byte averageQuals(byte base) { - return (byte) (getSumQuals(base) / getCount(base)); + public byte averageQuals(final byte base) { + return (byte) (getSumQuals(base) / countOfBase(base)); } @Ensures("result >= 0") - public byte averageQuals(BaseIndex base) { - return (byte) (getSumQuals(base) / getCount(base)); + public byte averageQuals(final BaseIndex base) { + return (byte) (getSumQuals(base) / countOfBase(base)); + } + + @Ensures("result >= 0") + public int countOfBase(final byte base) { + return countOfBase(BaseIndex.byteToBase(base)); + } + + @Ensures("result >= 0") + public int countOfBase(final BaseIndex base) { + return counts[base.index]; + } + + @Ensures("result >= 0") + public long sumQualsOfBase(final BaseIndex base) { + return sumQuals[base.index]; + } + + @Ensures("result >= 0") + public byte averageQualsOfBase(final BaseIndex base) { + return (byte) (sumQualsOfBase(base) / countOfBase(base)); + } + + + @Ensures("result >= 0") + public int totalCount() { + return totalCount; + } + + /** + * Given a base , it returns the proportional count of this base compared to all other bases + * + * @param base base + * @return the proportion of this base over all other bases + 
*/ + @Ensures({"result >=0.0", "result<= 1.0"}) + public double baseCountProportion(final byte base) { + return baseCountProportion(BaseIndex.byteToBase(base)); + } + + /** + * Given a base , it returns the proportional count of this base compared to all other bases + * + * @param baseIndex base + * @return the proportion of this base over all other bases + */ + @Ensures({"result >=0.0", "result<= 1.0"}) + public double baseCountProportion(final BaseIndex baseIndex) { + return (totalCount == 0) ? 0.0 : (double)counts[baseIndex.index] / (double)totalCount; + } + + @Ensures("result != null") + public String toString() { + StringBuilder b = new StringBuilder(); + for (final BaseIndex i : BaseIndex.values()) { + b.append(i.toString()).append("=").append(counts[i.index]).append(","); + } + return b.toString(); } public byte baseWithMostCounts() { return baseIndexWithMostCounts().getByte(); } - @Ensures("result >= 0") - public int countOfMostCommonBase() { - return counts.get(baseIndexWithMostCounts()); - } - - @Ensures("result >= 0") - public long sumQualsOfMostCommonBase() { - return sumQuals.get(baseIndexWithMostCounts()); - } - - @Ensures("result >= 0") - public byte averageQualsOfMostCommonBase() { - return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase()); - } - - - @Ensures("result >= 0") - public int totalCount() { - int sum = 0; - for (int c : counts.values()) - sum += c; - - return sum; - } - - /** - * Given a base , it returns the proportional count of this base compared to all other bases - * - * @param base - * @return the proportion of this base over all other bases - */ - @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportion(byte base) { - return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount(); - } - - /** - * Given a base , it returns the proportional count of this base compared to all other bases - * - * @param baseIndex - * @return the proportion of this base over all other bases - */ - 
@Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportion(BaseIndex baseIndex) { - int total = totalCount(); - if (total == 0) - return 0.0; - return (double) counts.get(baseIndex) / totalCount(); - } - - - @Ensures("result != null") - public String toString() { - StringBuilder b = new StringBuilder(); - for (Map.Entry elt : counts.entrySet()) { - b.append(elt.toString()).append("=").append(elt.getValue()).append(","); - } - return b.toString(); - } - @Ensures("result != null") public BaseIndex baseIndexWithMostCounts() { BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (BaseIndex i : counts.keySet()) - if (counts.get(i) > counts.get(maxI)) + for (final BaseIndex i : BaseIndex.values()) { + if (counts[i.index] > counts[maxI.index]) maxI = i; + } return maxI; } @Ensures("result != null") public BaseIndex baseIndexWithMostCountsWithoutIndels() { - BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS; - for (BaseIndex index : counts.keySet()) - if (index.isNucleotide() && counts.get(index) > counts.get(mostCounts)) - mostCounts = index; - return mostCounts; + BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + for (final BaseIndex i : BaseIndex.values()) { + if (i.isNucleotide() && counts[i.index] > counts[maxI.index]) + maxI = i; + } + return maxI; + } + + private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) { + final int targetCount = counts[targetIndex.index]; + final int testCount = counts[testIndex.index]; + return ( targetCount > testCount || (targetCount == testCount && sumQuals[targetIndex.index] > sumQuals[testIndex.index]) ); + } + + public byte baseWithMostProbability() { + return baseIndexWithMostProbability().getByte(); + } + + @Ensures("result != null") + public BaseIndex baseIndexWithMostProbability() { + BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + for (final BaseIndex i : BaseIndex.values()) { + if (sumQuals[i.index] > sumQuals[maxI.index]) + maxI = i; + } + return (sumQuals[maxI.index] > 
0L ? maxI : baseIndexWithMostCounts()); + } + + @Ensures("result != null") + public BaseIndex baseIndexWithMostProbabilityWithoutIndels() { + BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; + for (final BaseIndex i : BaseIndex.values()) { + if (i.isNucleotide() && sumQuals[i.index] > sumQuals[maxI.index]) + maxI = i; + } + return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCountsWithoutIndels()); } @Ensures("result >=0") public int totalCountWithoutIndels() { - int sum = 0; - for (BaseIndex index : counts.keySet()) - if (index.isNucleotide()) - sum += counts.get(index); - return sum; + return totalCount - counts[BaseIndex.D.index] - counts[BaseIndex.I.index]; } /** * Calculates the proportional count of a base compared to all other bases except indels (I and D) * - * @param index + * @param base base * @return the proportion of this base over all other bases except indels */ - @Requires("index.isNucleotide()") + @Requires("base.isNucleotide()") @Ensures({"result >=0.0", "result<= 1.0"}) - public double baseCountProportionWithoutIndels(BaseIndex index) { - int total = totalCountWithoutIndels(); - if (total == 0) - return 0.0; - return (double) counts.get(index) / totalCountWithoutIndels(); + public double baseCountProportionWithoutIndels(final BaseIndex base) { + final int total = totalCountWithoutIndels(); + return (total == 0) ? 
0.0 : (double)counts[base.index] / (double)total; + } + + public int[] countsArray() { + return counts.clone(); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java index a64db5874..02f867bcb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseIndex.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + /** * Simple byte / base index conversions * @@ -56,7 +58,7 @@ public enum BaseIndex { case 'N': case 'n': return N; - default: return null; + default: throw new ReviewedStingException("Tried to create a byte index for an impossible base " + base); } } @@ -68,7 +70,7 @@ public enum BaseIndex { * @return whether or not it is a nucleotide, given the definition above */ public boolean isNucleotide() { - return this == A || this == C || this == G || this == T || this == N; + return !isIndel(); } /** diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 6b92046de..3097c2ee9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import java.util.Arrays; import java.util.LinkedList; /** @@ -156,11 +157,9 @@ public class 
HeaderElement { * @return whether or not the HeaderElement is variant due to excess insertions */ private boolean isVariantFromInsertions(double minIndelProportion) { - int numberOfBases = consensusBaseCounts.totalCount(); - if (numberOfBases == 0 && insertionsToTheRight > 0) - return true; // we only have insertions - else if (numberOfBases == 0) - return false; // we don't have anything + final int numberOfBases = consensusBaseCounts.totalCount(); + if (numberOfBases == 0) + return (insertionsToTheRight > 0); // do we only have insertions? // if we have bases and insertions, check the ratio return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion; @@ -181,7 +180,7 @@ public class HeaderElement { * @return whether or not the HeaderElement is variant due to excess insertions */ private boolean isVariantFromMismatches(double minVariantProportion) { - BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels(); + BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels(); double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon); return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion); } @@ -200,5 +199,28 @@ public class HeaderElement { return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual; } + /** + * Calculates the number of haplotypes necessary to represent this site. + * + * @param minVariantProportion the minimum proportion to call a site variant. + * @return the number of haplotypes necessary to represent this site. 
+ */ + public int getNumberOfHaplotypes(double minVariantProportion) { + int nHaplotypes = 0; + int totalCount = consensusBaseCounts.totalCount(); + int runningCount = 0; + if (totalCount == 0) + return 0; + + int[] countsArray = consensusBaseCounts.countsArray(); + Arrays.sort(countsArray); + for (int i = countsArray.length-1; i>=0; i--) { + nHaplotypes++; + runningCount += countsArray[i]; + if (runningCount/totalCount > minVariantProportion) + break; + } + return nHaplotypes; + } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 44971ca38..7c9fc101b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -53,11 +53,13 @@ public class MultiSampleCompressor implements Compressor { final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, - final ReduceReads.DownsampleStrategy downsampleStrategy) { + final ReduceReads.DownsampleStrategy downsampleStrategy, + final int nContigs, + final boolean allowPolyploidReduction) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, - new SingleSampleCompressor(name, contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); + new SingleSampleCompressor(contextSize, downsampleCoverage, + minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction)); } } diff --git 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 177050667..5810bc94f 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.ReadFilters; @@ -52,23 +52,23 @@ import java.util.*; /** * Reduces the BAM file using read based compression that keeps only essential information for variant calling - *

+ * *

* This walker will generated reduced versions of the BAM files that still follow the BAM spec * and contain all the information necessary for the GSA variant calling pipeline. Some options * allow you to tune in how much compression you want to achieve. The default values have been * shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the * savings in file size and performance of the downstream tools. - *

+ * *

Input

*

* The BAM file to be compressed *

- *

+ * *

Output

*

* The compressed (reduced) BAM file. - *

+ * *

*

Examples

*
@@ -86,13 +86,13 @@ import java.util.*;
 public class ReduceReads extends ReadWalker, ReduceReadsStash> {
 
     @Output
-    protected StingSAMFileWriter out;
+    private StingSAMFileWriter out;
 
     /**
      * The number of bases to keep around mismatches (potential variation)
      */
     @Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
-    protected int contextSize = 10;
+    private int contextSize = 10;
 
     /**
      * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
@@ -100,7 +100,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
      * towards variable regions.
      */
     @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
-    protected int minMappingQuality = 20;
+    private int minMappingQuality = 20;
 
     /**
      * The minimum base quality to be considered for the consensus synthetic read. Reads that have
@@ -108,35 +108,41 @@ public class ReduceReads extends ReadWalker, ReduceRea
      * towards variable regions.
      */
     @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
-    protected byte minBaseQual = 20;
+    private byte minBaseQual = 20;
 
     /**
      * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
      * lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
      */
     @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
-    protected byte minTailQuality = 2;
+    private byte minTailQuality = 2;
+
+    /**
+     * Allow the experimental polyploid-based reduction capabilities of this tool
+     */
+    @Argument(fullName = "allow_polyploid_reduction", shortName = "polyploid", doc = "", required = false)
+    private boolean USE_POLYPLOID_REDUCTION = false;
 
     /**
      * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
      * and read group).
      */
     @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
-    protected boolean DONT_SIMPLIFY_READS = false;
+    private boolean DONT_SIMPLIFY_READS = false;
 
     /**
      * Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
      * The program will behave correctly in those cases.
      */
     @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
-    protected boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
+    private boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
 
     /**
      * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
      * quality.
      */
     @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
-    protected boolean DONT_CLIP_LOW_QUAL_TAILS = false;
+    private boolean DONT_CLIP_LOW_QUAL_TAILS = false;
 
     /**
      * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
@@ -144,7 +150,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
      * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
      */
     @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
-    protected boolean DONT_USE_SOFTCLIPPED_BASES = false;
+    private boolean DONT_USE_SOFTCLIPPED_BASES = false;
 
     /**
      * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee 
@@ -152,47 +158,55 @@ public class ReduceReads extends ReadWalker, ReduceRea
      * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing. 
      */
     @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
-    protected boolean DONT_COMPRESS_READ_NAMES = false;
+    private boolean DONT_COMPRESS_READ_NAMES = false;
 
     /**
      * Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval
      * border.
      */
     @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false)
-    protected boolean HARD_CLIP_TO_INTERVAL = false;
+    private boolean HARD_CLIP_TO_INTERVAL = false;
 
     /**
      * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
      * considered consensus.
      */
     @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
-    protected double minAltProportionToTriggerVariant = 0.05;
+    private double minAltProportionToTriggerVariant = 0.05;
 
     /**
      * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
      * considered consensus.
      */
     @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
-    protected double minIndelProportionToTriggerVariant = 0.05;
+    private double minIndelProportionToTriggerVariant = 0.05;
 
     /**
      * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this).
      * A value of 0 turns downsampling off.
      */
     @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
-    protected int downsampleCoverage = 250;
+    private int downsampleCoverage = 250;
+
+    /**
+     * Number of chromosomes in the sample (this is used for the polyploid consensus compression). Only
+     * tested for humans (or organisms with n=2). Use at your own risk!
+     */
+    @Hidden
+    @Argument(fullName = "contigs", shortName = "ctg", doc = "", required = false)
+    private int nContigs = 2;
 
     @Hidden
     @Argument(fullName = "", shortName = "dl", doc = "", required = false)
-    protected int debugLevel = 0;
+    private int debugLevel = 0;
 
     @Hidden
     @Argument(fullName = "", shortName = "dr", doc = "", required = false)
-    protected String debugRead = "";
+    private String debugRead = "";
 
     @Hidden
     @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
-    protected DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
+    private DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
     
     @Hidden 
     @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
@@ -203,7 +217,6 @@ public class ReduceReads extends ReadWalker, ReduceRea
         Adaptive
     }
     
-    protected int totalReads = 0;
     int nCompressedReads = 0;
 
     HashMap readNameHash;                                     // This hash will keep the name of the original read the new compressed name (a number).
@@ -247,16 +260,15 @@ public class ReduceReads extends ReadWalker, ReduceRea
      * @return a linked list with all the reads produced by the clipping operations
      */
     @Override
-    public LinkedList map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
+    public LinkedList map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
         LinkedList mappedReads;
-        totalReads++;
         if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))
                 System.out.println("Found debug read!");
 
         if (debugLevel == 1)
             System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd());
 
-        // we write the actual alignment starts to their respectiv alignment shift tags in the temporary
+        // we write the actual alignment starts to their respective alignment shift tags in the temporary
         // attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file
         read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart());
         read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd());
@@ -316,7 +328,7 @@ public class ReduceReads extends ReadWalker, ReduceRea
      */
     @Override
     public ReduceReadsStash reduceInit() {
-        return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
+        return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION));
     }
 
     /**
@@ -532,8 +544,6 @@ public class ReduceReads extends ReadWalker, ReduceRea
                 read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift);               // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (start)
             if (endShift > 0)
                 read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift);                   // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (end)
-            
-            totalReads++;
         }
 
         if (debugLevel == 1)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
index 6d2c2d215..6a086c53b 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java
@@ -1,6 +1,5 @@
 package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
 
-import org.apache.log4j.Logger;
 import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 
@@ -8,35 +7,33 @@ import java.util.TreeSet;
 
 /**
  *
- * @author depristo
- * @version 0.1
+ * @author carneiro, depristo
+ * @version 3.0
  */
 public class SingleSampleCompressor implements Compressor {
-    protected static final Logger logger = Logger.getLogger(SingleSampleCompressor.class);
+    final private int contextSize;
+    final private int downsampleCoverage;
+    final private int minMappingQuality;
+    final private double minAltProportionToTriggerVariant;
+    final private double minIndelProportionToTriggerVariant;
+    final private int minBaseQual;
+    final private ReduceReads.DownsampleStrategy downsampleStrategy;
+    final private int nContigs;
+    final private boolean allowPolyploidReduction;
 
-    protected final int contextSize;
-    protected final int downsampleCoverage;
-    protected int minMappingQuality;
-    protected int slidingWindowCounter;
+    private SlidingWindow slidingWindow;
+    private int slidingWindowCounter;
 
-    protected final String sampleName;
 
-    protected SlidingWindow slidingWindow;
-    protected double minAltProportionToTriggerVariant;
-    protected double minIndelProportionToTriggerVariant;
-    protected int minBaseQual;
-
-    protected ReduceReads.DownsampleStrategy downsampleStrategy;
-
-    public SingleSampleCompressor(final String sampleName,
-                                  final int contextSize,
+    public SingleSampleCompressor(final int contextSize,
                                   final int downsampleCoverage,
                                   final int minMappingQuality,
                                   final double minAltProportionToTriggerVariant,
                                   final double minIndelProportionToTriggerVariant,
                                   final int minBaseQual,
-                                  final ReduceReads.DownsampleStrategy downsampleStrategy) {
-        this.sampleName = sampleName;
+                                  final ReduceReads.DownsampleStrategy downsampleStrategy,
+                                  final int nContigs,
+                                  final boolean allowPolyploidReduction) {
         this.contextSize = contextSize;
         this.downsampleCoverage = downsampleCoverage;
         this.minMappingQuality = minMappingQuality;
@@ -45,6 +42,8 @@ public class SingleSampleCompressor implements Compressor {
         this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
         this.minBaseQual = minBaseQual;
         this.downsampleStrategy = downsampleStrategy;
+        this.nContigs = nContigs;
+        this.allowPolyploidReduction = allowPolyploidReduction;
     }
 
     /**
@@ -66,7 +65,7 @@ public class SingleSampleCompressor implements Compressor {
         }
 
         if ( slidingWindow == null) {                                                  // this is the first read
-            slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
+            slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction);
             slidingWindowCounter++;
         }
 
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
index 7173e6c70..32abe8ef6 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java
@@ -8,14 +8,12 @@ import net.sf.samtools.SAMFileHeader;
 import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
 import org.broadinstitute.sting.utils.collections.Pair;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.recalibration.EventType;
 import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.sam.ReadUtils;
 
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.ListIterator;
+import java.util.*;
 
 /**
  * Created by IntelliJ IDEA.
@@ -26,13 +24,12 @@ import java.util.ListIterator;
 public class SlidingWindow {
 
     // Sliding Window data
-    final private LinkedList readsInWindow;
+    final private TreeSet readsInWindow;
     final private LinkedList windowHeader;
     protected int contextSize;                                                                                          // the largest context size (between mismatches and indels)
-    protected int stopLocation;
     protected String contig;
     protected int contigIndex;
-    protected SAMFileHeader header;
+    protected SAMFileHeader samHeader;
     protected GATKSAMReadGroupRecord readGroupAttribute;
     protected int downsampleCoverage;
 
@@ -56,6 +53,10 @@ public class SlidingWindow {
     protected ReduceReads.DownsampleStrategy downsampleStrategy;
     private boolean hasIndelQualities;
 
+    private final int nContigs;
+
+    private boolean allowPolyploidReductionInGeneral;
+
     /**
      * The types of synthetic reads to use in the finalizeAndAdd method
      */
@@ -66,7 +67,11 @@ public class SlidingWindow {
     }
 
     public int getStopLocation() {
-        return stopLocation;
+        return getStopLocation(windowHeader);
+    }
+
+    private int getStopLocation(LinkedList header) {
+        return getStartLocation(header) + header.size() - 1;
     }
 
     public String getContig() {
@@ -77,13 +82,12 @@ public class SlidingWindow {
         return contigIndex;
     }
 
-    public int getStartLocation() {
-        return windowHeader.isEmpty() ? -1 : windowHeader.peek().getLocation();
+    public int getStartLocation(LinkedList header) {
+        return header.isEmpty() ? -1 : header.peek().getLocation();
     }
 
 
-    public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader header, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) {
-        this.stopLocation = -1;
+    public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) {
         this.contextSize = contextSize;
         this.downsampleCoverage = downsampleCoverage;
 
@@ -93,11 +97,17 @@ public class SlidingWindow {
         this.MIN_MAPPING_QUALITY = minMappingQuality;
 
         this.windowHeader = new LinkedList();
-        this.readsInWindow = new LinkedList();
+        this.readsInWindow = new TreeSet(new Comparator() {
+            @Override
+            public int compare(GATKSAMRecord read1, GATKSAMRecord read2) {
+                final int difference = read1.getSoftEnd() - read2.getSoftEnd();
+                return difference != 0 ? difference : read1.getReadName().compareTo(read2.getReadName());
+            }
+        });
 
         this.contig = contig;
         this.contigIndex = contigIndex;
-        this.header = header;
+        this.samHeader = samHeader;
         this.readGroupAttribute = readGroupAttribute;
 
         this.consensusCounter = 0;
@@ -111,6 +121,9 @@ public class SlidingWindow {
         
         this.downsampleStrategy = downsampleStrategy;
         this.hasIndelQualities = hasIndelQualities;
+        this.nContigs = nContigs;
+
+        this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
     }
 
     /**
@@ -125,7 +138,7 @@ public class SlidingWindow {
      * @return a list of reads that have been finished by sliding the window.
      */
     public List addRead(GATKSAMRecord read) {
-        updateHeaderCounts(read, false);                                                                                // update the window header counts
+        addToHeader(windowHeader, read);                                                                                // update the window header counts
         readsInWindow.add(read);                                                                                        // add read to sliding reads
         return slideWindow(read.getUnclippedStart());
     }
@@ -188,54 +201,105 @@ public class SlidingWindow {
      * @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
      * @return all reads that have fallen to the left of the sliding window after the slide
      */
-    protected List slideWindow(int incomingReadUnclippedStart) {
+    protected List slideWindow(final int incomingReadUnclippedStart) {
         List finalizedReads = new LinkedList();
 
-        if (incomingReadUnclippedStart - contextSize > getStartLocation()) {
-            int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation();
-            boolean[] variantSite = markSites(getStartLocation() + readStartHeaderIndex);
+        final int windowHeaderStartLocation = getStartLocation(windowHeader);
+
+        if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) {
+            markSites(incomingReadUnclippedStart);
+            int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation;
             int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0);                                       // this is the limit of what we can close/send to consensus (non-inclusive)
 
-            List> regions = getAllVariantRegions(0, breakpoint, variantSite);
+            List> regions = getAllVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet());
             finalizedReads = closeVariantRegions(regions, false);
 
-            List readsToRemove = new LinkedList();
-            for (GATKSAMRecord read : readsInWindow) {                                                                  // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
-                if (read.getAlignmentEnd() < getStartLocation()) {
-                    readsToRemove.add(read);
-                }
-            }
-            for (GATKSAMRecord read : readsToRemove) {
-                readsInWindow.remove(read);
+            while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
+                readsInWindow.pollFirst();
             }
         }
 
         return finalizedReads;
     }
 
+
+    private final class MarkedSites {
+        // Caches the per-site variant marks between calls so that marks for an overlapping region can be reused.
+        private boolean[] siteIsVariant = new boolean[0];   // siteIsVariant[i] is the mark for position startLocation + i
+        private int startLocation = 0;                      // position represented by siteIsVariant[0]
+
+        public MarkedSites() {}
+
+        public boolean[] getVariantSiteBitSet() { return siteIsVariant; }   // returns the live backing array, not a copy
+
+        /**
+         * Re-anchors the marks array at a new start location and size, keeping any previously computed marks that still overlap.
+         *
+         * @param newStartLocation   the position that index 0 of the array represents after this call
+         * @param sizeOfRegion       the length of the array after this call
+         *
+         * @return newStartLocation plus the number of leading entries carried over from the previous array (0 when a fresh array was allocated, sizeOfRegion when neither start nor size changed); the calling method is responsible for the remainder.
+         */
+        public int updateRegion(final int newStartLocation, final int sizeOfRegion) {
+            int lastPositionMarked = sizeOfRegion;   // default: start and size are unchanged, so the whole array is kept as is
+
+            // the new start lies at/after the end of the old region (always true for the initial empty array) or before its start: nothing can be reused, so create a new all-false array from scratch
+            if ( newStartLocation >= this.startLocation + siteIsVariant.length || newStartLocation < this.startLocation ) {
+                siteIsVariant = new boolean[sizeOfRegion];
+                lastPositionMarked = 0;
+            }
+            // the new start falls inside the old region but the start and/or size changed: copy the overlapping part to the front of a new array
+            else if ( newStartLocation != this.startLocation || sizeOfRegion != siteIsVariant.length ) {
+                final boolean[] tempArray = new boolean[sizeOfRegion];
+                final int differenceInStartPositions = newStartLocation - this.startLocation;   // offset of the new start within the old array
+                lastPositionMarked = Math.min(siteIsVariant.length - differenceInStartPositions, sizeOfRegion);   // number of old entries that fit into the new array
+                System.arraycopy(siteIsVariant, differenceInStartPositions, tempArray, 0, lastPositionMarked);
+                siteIsVariant = null;   // explicitly allow garbage collection (redundant: the reference is overwritten on the next line)
+                siteIsVariant = tempArray;
+            }
+
+            this.startLocation = newStartLocation;
+
+            return lastPositionMarked + newStartLocation;
+        }
+    }
+
+    private final MarkedSites markedSites = new MarkedSites();
+
     /**
      * returns an array marked with variant and non-variant regions (it uses
      * markVariantRegions to make the marks)
      *
      * @param stop check the window from start to stop (not-inclusive)
-     * @return a boolean array with 'true' marking variant regions and false marking consensus sites
      */
-    protected boolean[] markSites(int stop) {
+    protected void markSites(final int stop) {   // updates the cached markedSites field in place instead of returning a new array
 
-        boolean[] markedSites = new boolean[stop - getStartLocation() + contextSize + 1];
+        final int windowHeaderStartLocation = getStartLocation(windowHeader);
+        final int sizeOfMarkedRegion = stop - windowHeaderStartLocation + contextSize + 1;
 
+        // copy over as many bits as we can from the previous calculation.  Note that we can't trust the
+        // last (contextSize + 1) worth of carried-over bits because we may not have actually looked at variant regions there.
+        final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1;
+        final int locationToProcess = Math.max(windowHeaderStartLocation, Math.min(lastPositionMarked, stop - contextSize));   // clamp to the header start: otherwise i would run ahead of the iterator below and marks would land at the wrong index
+
+        // advance the iterator past the sites whose marks were carried over, so that it is positioned at locationToProcess
         Iterator headerElementIterator = windowHeader.iterator();
-        for (int i = getStartLocation(); i < stop; i++) {
+        for (int i = windowHeaderStartLocation; i < locationToProcess; i++) {
+            if (headerElementIterator.hasNext())
+                headerElementIterator.next();
+        }
+
+        // process the remaining sites (from locationToProcess up to, but not including, stop) from scratch in case there's a variant there
+        for (int i = locationToProcess; i < stop; i++) {
             if (headerElementIterator.hasNext()) {
                 HeaderElement headerElement = headerElementIterator.next();
 
                 if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
-                    markVariantRegion(markedSites, i - getStartLocation());
+                    markVariantRegion(markedSites, i - windowHeaderStartLocation);   // convert the position into an index of the marks array
 
             } else
                 break;
         }
-        return markedSites;
     }
 
     /**
@@ -244,11 +308,11 @@ public class SlidingWindow {
      * @param markedSites         the boolean array to bear the marks
      * @param variantSiteLocation the location where a variant site was found
      */
-    protected void markVariantRegion(boolean[] markedSites, int variantSiteLocation) {
+    protected void markVariantRegion(final MarkedSites markedSites, final int variantSiteLocation) {   // note: the parameter shadows the markedSites field of the same name
         int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
-        int to = (variantSiteLocation + contextSize + 1 > markedSites.length) ? markedSites.length : variantSiteLocation + contextSize + 1;
+        int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length : variantSiteLocation + contextSize + 1;   // exclusive upper bound, clamped to the array length
         for (int i = from; i < to; i++)
-            markedSites[i] = true;
+            markedSites.getVariantSiteBitSet()[i] = true;   // writes straight into the array held by markedSites
     }
 
     /**
@@ -260,46 +324,45 @@ public class SlidingWindow {
      * @param end   the first header index NOT TO add to consensus
      * @return a list of consensus reads generated by this call. Empty list if no consensus was generated.
      */
-    protected List addToSyntheticReads(int start, int end) {
+    protected List addToSyntheticReads(LinkedList header, int start, int end, boolean isNegativeStrand) {
         LinkedList reads = new LinkedList();
         if (start < end) {
-
-            ListIterator headerElementIterator = windowHeader.listIterator(start);
+            ListIterator headerElementIterator = header.listIterator(start);
 
             if (!headerElementIterator.hasNext())
-                throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d  - %d / %d", start, windowHeader.size(), end));
+                throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d  - %d / %d", start, header.size(), end));
 
             HeaderElement headerElement = headerElementIterator.next();
 
             if (headerElement.hasConsensusData()) {
                 reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));
 
-                int endOfConsensus = findNextNonConsensusElement(start, end);
-                addToRunningConsensus(start, endOfConsensus);
+                int endOfConsensus = findNextNonConsensusElement(header, start, end);
+                addToRunningConsensus(header, start, endOfConsensus, isNegativeStrand);
 
                 if (endOfConsensus <= start)
                     throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start));
 
-                reads.addAll(addToSyntheticReads(endOfConsensus, end));
+                reads.addAll(addToSyntheticReads(header, endOfConsensus, end, isNegativeStrand));
             } else if (headerElement.hasFilteredData()) {
                 reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
 
-                int endOfFilteredData = findNextNonFilteredDataElement(start, end);
-                addToFilteredData(start, endOfFilteredData);
+                int endOfFilteredData = findNextNonFilteredDataElement(header, start, end);
+                reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand));
 
                 if (endOfFilteredData <= start)
                     throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
 
-                reads.addAll(addToSyntheticReads(endOfFilteredData, end));
+                reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, isNegativeStrand));
             } else if (headerElement.isEmpty()) {
                 reads.addAll(finalizeAndAdd(ConsensusType.BOTH));
 
-                int endOfEmptyData = findNextNonEmptyElement(start, end);
+                int endOfEmptyData = findNextNonEmptyElement(header, start, end);
 
                 if (endOfEmptyData <= start)
                     throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));
 
-                reads.addAll(addToSyntheticReads(endOfEmptyData, end));
+                reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, isNegativeStrand));
             } else
                 throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start));
 
@@ -343,8 +406,8 @@ public class SlidingWindow {
      * @param upTo  limit to search for another consensus element
      * @return next position with consensus data or empty
      */
-    private int findNextNonConsensusElement(int start, int upTo) {
-        Iterator headerElementIterator = windowHeader.listIterator(start);
+    private int findNextNonConsensusElement(LinkedList header, int start, int upTo) {
+        Iterator headerElementIterator = header.listIterator(start);
         int index = start;
         while (index < upTo) {
             if (!headerElementIterator.hasNext())
@@ -365,8 +428,8 @@ public class SlidingWindow {
      * @param upTo  limit to search for
      * @return next position with no filtered data
      */
-    private int findNextNonFilteredDataElement(int start, int upTo) {
-        Iterator headerElementIterator = windowHeader.listIterator(start);
+    private int findNextNonFilteredDataElement(LinkedList header, int start, int upTo) {
+        Iterator headerElementIterator = header.listIterator(start);
         int index = start;
         while (index < upTo) {
             if (!headerElementIterator.hasNext())
@@ -387,8 +450,8 @@ public class SlidingWindow {
      * @param upTo  limit to search for
      * @return next position with non-empty element
      */
-    private int findNextNonEmptyElement(int start, int upTo) {
-        ListIterator headerElementIterator = windowHeader.listIterator(start);
+    private int findNextNonEmptyElement(LinkedList header, int start, int upTo) {
+        ListIterator headerElementIterator = header.listIterator(start);
         int index = start;
         while (index < upTo) {
             if (!headerElementIterator.hasNext())
@@ -412,11 +475,13 @@ public class SlidingWindow {
      * @param start the first header index to add to consensus
      * @param end   the first header index NOT TO add to consensus
      */
-    private void addToFilteredData(int start, int end) {
-        if (filteredDataConsensus == null)
-            filteredDataConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);
+    private List addToFilteredData(LinkedList header, int start, int end, boolean isNegativeStrand) {
+        List result = new ArrayList(0);
 
-        ListIterator headerElementIterator = windowHeader.listIterator(start);
+        if (filteredDataConsensus == null)
+            filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
+
+        ListIterator headerElementIterator = header.listIterator(start);
         for (int index = start; index < end; index++) {
             if (!headerElementIterator.hasNext())
                 throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist");
@@ -428,8 +493,15 @@ public class SlidingWindow {
             if (!headerElement.hasFilteredData())
                 throw new ReviewedStingException("No filtered data in " + index);
 
+            if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) {
+                result.add(finalizeFilteredDataConsensus());
+                filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
+            }
+
             genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
         }
+
+        return result;
     }
 
     /**
@@ -441,11 +513,11 @@ public class SlidingWindow {
      * @param start the first header index to add to consensus
      * @param end   the first header index NOT TO add to consensus
      */
-    private void addToRunningConsensus(int start, int end) {
+    private void addToRunningConsensus(LinkedList header, int start, int end, boolean isNegativeStrand) {
         if (runningConsensus == null)
-            runningConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);
+            runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
 
-        Iterator headerElementIterator = windowHeader.listIterator(start);
+        Iterator headerElementIterator = header.listIterator(start);
         for (int index = start; index < end; index++) {
             if (!headerElementIterator.hasNext())
                 throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist");
@@ -466,14 +538,76 @@ public class SlidingWindow {
      * @param rms           the rms mapping quality in the header element
      */
     private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
-        BaseIndex base = baseCounts.baseIndexWithMostCounts();
-        byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE);
-        byte qual = baseCounts.averageQualsOfMostCommonBase();
-        byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase();
-        byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase();
+        final BaseIndex base = baseCounts.baseIndexWithMostProbability();   // the base chosen here drives every per-base lookup below
+        byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE);   // capped at Byte.MAX_VALUE so the cast to byte cannot wrap
+        byte qual = baseCounts.averageQualsOfBase(base);
+        byte insQual = baseCounts.averageInsertionQualsOfBase(base);
+        byte delQual = baseCounts.averageDeletionQualsOfBase(base);
         syntheticRead.add(base, count, qual, insQual, delQual, rms);
     }
 
+    private List compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {   // start and stop are inclusive indices into windowHeader
+        List allReads = new LinkedList();
+
+        // Try to compress into a polyploid consensus
+        int nHaplotypes = 0;
+        int hetRefPosition = -1;   // despite the name this holds a header index; it is converted to a location via getLocation() below
+        boolean canCompress = true;
+        boolean foundEvent = false;
+        Object[] header = windowHeader.toArray();   // array snapshot of the header for indexed access in the scan below
+
+        // foundEvent will remain false if we don't allow polyploid reduction
+        if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) {
+            for (int i = start; i<=stop; i++) {
+                nHaplotypes = ((HeaderElement) header[i]).getNumberOfHaplotypes(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
+                if (nHaplotypes > nContigs) {   // more haplotypes at this site than nContigs allows: give up on compression
+                    canCompress = false;
+                    break;
+                }
+
+                // guarantees that there is only 1 site in the variant region that needs more than one haplotype
+                if (nHaplotypes > 1) {
+                    if (!foundEvent) {
+                        foundEvent = true;
+                        hetRefPosition = i;
+                    }
+                    else {
+                        canCompress = false;
+                        break;
+                    }
+                }
+            }
+        }
+
+        // Try to compress the variant region
+        // the "foundEvent" protects us from trying to compress variant regions that are created by insertions
+        if (canCompress && foundEvent) {
+            allReads = createPolyploidConsensus(start, stop, nHaplotypes, ((HeaderElement) header[hetRefPosition]).getLocation());   // NOTE(review): nHaplotypes holds the value from the last site scanned (index stop), not necessarily the het site
+        }
+
+        // Return all reads that overlap the variant region and remove them from the window header entirely
+        // also remove all reads preceding the variant region (since they will be output as consensus right after compression)
+        else {
+            final int refStart = windowHeader.get(start).getLocation();
+            final int refStop = windowHeader.get(stop).getLocation();
+
+            LinkedList toRemove = new LinkedList();   // removal is deferred so readsInWindow is not modified while it is being iterated
+            for (GATKSAMRecord read : readsInWindow) {
+                if (read.getSoftStart() <= refStop) {   // read starts before or inside the variant region
+                    if (read.getAlignmentEnd() >= refStart) {   // ... and also reaches into it: it overlaps the region
+                        allReads.add(read);
+                        removeFromHeader(windowHeader, read);
+                    }
+                    toRemove.add(read);
+                }
+            }
+            for (GATKSAMRecord read : toRemove) {
+                readsInWindow.remove(read);
+            }
+        }
+        return allReads;
+    }
+
     /**
      * Finalizes a variant region, any adjacent synthetic reads.
      *
@@ -482,27 +616,13 @@ public class SlidingWindow {
      * @return all reads contained in the variant region plus any adjacent synthetic reads
      */
     @Requires("start <= stop")
-    protected List closeVariantRegion(int start, int stop) {
-        List allReads = new LinkedList();
-
-        int refStart = windowHeader.get(start).getLocation();                                                           // All operations are reference based, not read based
-        int refStop = windowHeader.get(stop).getLocation();
-
-        for (GATKSAMRecord read : readsInWindow) {                                                                      // Keep all reads that overlap the variant region
-            if (read.getSoftStart() <= refStop && read.getAlignmentEnd() >= refStart) {
-                allReads.add(read);
-                updateHeaderCounts(read, true);                                                                         // Remove this read from the window header entirely
-            }
-        }
+    protected List closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
+        List allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition);
 
         List result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
-        result.addAll(addToSyntheticReads(0, start));
+        result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
         result.addAll(finalizeAndAdd(ConsensusType.BOTH));
 
-        for (GATKSAMRecord read : allReads) {
-            readsInWindow.remove(read);                                                                                 // todo -- not optimal, but needs to be done so the next region doesn't try to remove the same reads from the header counts.
-        }
-
         return result;                                                                                                  // finalized reads will be downsampled if necessary
     }
 
@@ -517,7 +637,7 @@ public class SlidingWindow {
                 if (stop < 0 && forceClose)
                     stop = windowHeader.size() - 1;
                 if (stop >= 0) {
-                    allReads.addAll(closeVariantRegion(start, stop));
+                    allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
                     lastStop = stop;
                 }
             }
@@ -545,7 +665,7 @@ public class SlidingWindow {
 
         ReservoirDownsampler  downsampler = new ReservoirDownsampler(downsampleCoverage);
         downsampler.submit(allReads);
-        return downsampler.consumeDownsampledItems();
+        return downsampler.consumeFinalizedItems();
     }
 
 
@@ -561,16 +681,17 @@ public class SlidingWindow {
         List finalizedReads = new LinkedList();
 
         if (!windowHeader.isEmpty()) {
-            boolean[] variantSite = markSites(stopLocation + 1);
-            List> regions = getAllVariantRegions(0, windowHeader.size(), variantSite);
+            markSites(getStopLocation(windowHeader) + 1);
+            List> regions = getAllVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet());
             finalizedReads = closeVariantRegions(regions, true);
 
             if (!windowHeader.isEmpty()) {
-                finalizedReads.addAll(addToSyntheticReads(0, windowHeader.size() - 1));
+                finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false));
                 finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH));                                              // if it ended in running consensus, finish it up
             }
 
         }
+
         return finalizedReads;
     }
 
@@ -611,13 +732,96 @@ public class SlidingWindow {
     }
 
 
+
+    private List createPolyploidConsensus(int start, int stop, int nHaplotypes, int hetRefPosition) {   // start/stop are windowHeader indices; hetRefPosition is compared against read soft start/end, i.e. it is a reference location
+        // we will create two (positive strand, negative strand) headers for each haplotype
+        List> headersPosStrand = new ArrayList>();
+        List> headersNegStrand = new ArrayList>();
+        List hetReads = new LinkedList();
+        Map haplotypeHeaderMap = new HashMap(nHaplotypes);   // base observed at the het site -> index into the two header lists; nHaplotypes is only used as the initial capacity
+        int currentHaplotype = 0;
+        int refStart = windowHeader.get(start).getLocation();
+        int refStop = windowHeader.get(stop).getLocation();
+        List toRemove = new LinkedList();   // removal is deferred so readsInWindow is not modified while it is being iterated
+        for (GATKSAMRecord read : readsInWindow) {
+            int haplotype;
+
+            // check if the read is either before or inside the variant region
+            if (read.getSoftStart() <= refStop) {
+                // check if the read passes the mapping quality filter and reaches into the variant region
+                if (read.getMappingQuality() >= MIN_MAPPING_QUALITY && read.getSoftEnd() >= refStart) {
+                    // check if the read contains the het site
+                    if (read.getSoftStart() <= hetRefPosition && read.getSoftEnd() >= hetRefPosition) {
+                        int readPos = ReadUtils.getReadCoordinateForReferenceCoordinate(read, hetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL);
+                        byte base = read.getReadBases()[readPos];
+                        byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPos];
+
+                        // check if base passes the filters!
+                        if (qual >= MIN_BASE_QUAL_TO_COUNT) {
+                            // check which haplotype this read represents and take the index of it from the list of headers
+                            if (haplotypeHeaderMap.containsKey(base)) {
+                                haplotype = haplotypeHeaderMap.get(base);
+                            }
+                            // create new lists if this haplotype has not been seen yet
+                            else {
+                                haplotype = currentHaplotype;
+                                haplotypeHeaderMap.put(base, currentHaplotype);
+                                headersPosStrand.add(new LinkedList());
+                                headersNegStrand.add(new LinkedList());
+                                currentHaplotype++;
+                            }
+                            LinkedList header = read.getReadNegativeStrandFlag() ? headersNegStrand.get(haplotype) : headersPosStrand.get(haplotype);
+                            // add to the polyploid header
+                            addToHeader(header, read);
+                            // remove from the standard header so that we don't double count it
+                            removeFromHeader(windowHeader, read);
+                        }
+                    }
+                }
+                // note: reads that fail any of the checks above are not removed from windowHeader, so their counts stay there
+                // we remove all reads before and inside the variant region from the window
+                toRemove.add(read);
+            }
+        }
+        // turn each non-empty positive-strand haplotype header into synthetic reads, flushing any running consensus after each header
+        for (LinkedList header : headersPosStrand) {
+            if (header.size() > 0)
+                hetReads.addAll(addToSyntheticReads(header, 0, header.size(), false));
+            if (runningConsensus != null)
+                hetReads.add(finalizeRunningConsensus());
+        }
+        for (LinkedList header : headersNegStrand) {   // same again for the negative-strand headers (isNegativeStrand = true)
+            if (header.size() > 0)
+                hetReads.addAll(addToSyntheticReads(header, 0, header.size(), true));
+            if (runningConsensus != null)
+                hetReads.add(finalizeRunningConsensus());
+        }
+
+        for (GATKSAMRecord read : toRemove) {
+            readsInWindow.remove(read);
+        }
+        return hetReads;
+    }
+
+
+    private void addToHeader(LinkedList header, GATKSAMRecord read) {   // convenience wrapper around updateHeaderCounts
+        updateHeaderCounts(header, read, false);   // removeRead = false: the read is being added to the header
+    }
+
+    private void removeFromHeader(LinkedList header, GATKSAMRecord read) {   // convenience wrapper around updateHeaderCounts
+        updateHeaderCounts(header, read, true);   // removeRead = true: the read is being removed from the header
+    }
+
+
     /**
      * Updates the sliding window's header counts with the incoming read bases, insertions
      * and deletions.
      *
+     * @param header the sliding window header to use
      * @param read the incoming read to be added to the sliding window
+     * @param removeRead if we are removing the read from the header or adding
      */
-    protected void updateHeaderCounts(GATKSAMRecord read, boolean removeRead) {
+    private void updateHeaderCounts(LinkedList header, GATKSAMRecord read, boolean removeRead) {
         byte[] bases = read.getReadBases();
         byte[] quals = read.getBaseQualities();
         byte[] insQuals = read.getExistingBaseInsertionQualities();
@@ -627,8 +831,9 @@ public class SlidingWindow {
         Cigar cigar = read.getCigar();
 
         int readBaseIndex = 0;
-        int startLocation = getStartLocation();
+        int startLocation = getStartLocation(header);
         int locationIndex = startLocation < 0 ? 0 : readStart - startLocation;
+        int stopLocation = getStopLocation(header);
 
         if (removeRead && locationIndex < 0)
             throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " start " + read.getUnclippedStart() + "," + read.getUnclippedEnd() + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation);
@@ -636,7 +841,7 @@ public class SlidingWindow {
         if (!removeRead) {                                                                                              // we only need to create new header elements if we are adding the read, not when we're removing it
             if (locationIndex < 0) {                                                                                    // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window
                 for (int i = 1; i <= -locationIndex; i++)
-                    windowHeader.addFirst(new HeaderElement(startLocation - i));
+                    header.addFirst(new HeaderElement(startLocation - i));
 
                 startLocation = readStart;                                                               // update start location accordingly
                 locationIndex = 0;
@@ -645,19 +850,17 @@ public class SlidingWindow {
             if (stopLocation < readEnd) {                                                                // Do we need to add extra elements to the header?
                 int elementsToAdd = (stopLocation < 0) ? readEnd - readStart + 1 : readEnd - stopLocation;
                 while (elementsToAdd-- > 0)
-                    windowHeader.addLast(new HeaderElement(readEnd - elementsToAdd));
-
-                stopLocation = readEnd;                                                                  // update stopLocation accordingly
+                    header.addLast(new HeaderElement(readEnd - elementsToAdd));
             }
 
             // Special case for leading insertions before the beginning of the sliding read
             if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) {
-                windowHeader.addFirst(new HeaderElement(readStart - 1));                                 // create a new first element to the window header with no bases added
+                header.addFirst(new HeaderElement(readStart - 1));                                 // create a new first element to the window header with no bases added
                 locationIndex = 1;                                                                                      // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing.
             }
         }
 
-        Iterator headerElementIterator = windowHeader.listIterator(locationIndex);
+        Iterator headerElementIterator = header.listIterator(locationIndex);
         HeaderElement headerElement;
         for (CigarElement cigarElement : cigar.getCigarElements()) {
             switch (cigarElement.getOperator()) {
@@ -668,7 +871,7 @@ public class SlidingWindow {
                         break;
                     }
 
-                    headerElement = windowHeader.get(locationIndex - 1);                                                // insertions are added to the base to the left (previous element)
+                    headerElement = header.get(locationIndex - 1);                                                // insertions are added to the base to the left (previous element)
 
                     if (removeRead) {
                         headerElement.removeInsertionToTheRight();
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
index 6134101d9..ccf81dd67 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticRead.java
@@ -5,9 +5,9 @@ import net.sf.samtools.Cigar;
 import net.sf.samtools.CigarElement;
 import net.sf.samtools.CigarOperator;
 import net.sf.samtools.SAMFileHeader;
-import org.broadinstitute.sting.utils.recalibration.EventType;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.recalibration.EventType;
 import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 
@@ -44,8 +44,9 @@ public class SyntheticRead {
     private String contig;
     private int contigIndex;
     private String readName;
-    private Integer refStart;
+    private int refStart;
     private boolean hasIndelQualities = false;
+    private boolean isNegativeStrand = false;
 
     /**
      * Full initialization of the running consensus if you have all the information and are ready to
@@ -59,7 +60,7 @@ public class SyntheticRead {
      * @param refStart        the alignment start (reference based)
      * @param readTag         the reduce reads tag for the synthetic read
      */
-    public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities) {
+    public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
         final int initialCapacity = 10000;
         bases = new ArrayList(initialCapacity);
         counts = new ArrayList(initialCapacity);
@@ -76,9 +77,10 @@ public class SyntheticRead {
         this.readName = readName;
         this.refStart = refStart;
         this.hasIndelQualities = hasIndelQualities;
+        this.isNegativeStrand = isNegativeRead;
     }
 
-    public SyntheticRead(List bases, List counts, List quals, List insertionQuals, List deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities) {
+    public SyntheticRead(List bases, List counts, List quals, List insertionQuals, List deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
         this.bases = bases;
         this.counts = counts;
         this.quals = quals;
@@ -93,6 +95,7 @@ public class SyntheticRead {
         this.readName = readName;
         this.refStart = refStart;
         this.hasIndelQualities = hasIndelQualities;
+        this.isNegativeStrand = isNegativeRead;
     }
 
     /**
@@ -112,11 +115,15 @@ public class SyntheticRead {
         this.mappingQuality += mappingQuality;
     }
 
-    public BaseIndex getBase(int readCoordinate) {
+    public BaseIndex getBase(final int readCoordinate) {
         return bases.get(readCoordinate);
     }
 
-   /**
+    public int getRefStart() {
+        return refStart;
+    }
+
+    /**
      * Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
      *
      * Invalid reads are :
@@ -133,6 +140,7 @@ public class SyntheticRead {
         read.setReferenceIndex(contigIndex);
         read.setReadPairedFlag(false);
         read.setReadUnmappedFlag(false);
+        read.setReadNegativeStrandFlag(isNegativeStrand);
         read.setCigar(buildCigar());                                        // the alignment start may change while building the cigar (leading deletions)
         read.setAlignmentStart(refStart);
         read.setReadName(readName);
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
index 26ff4db24..fc6d23382 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ErrorModel.java
@@ -1,11 +1,11 @@
 package org.broadinstitute.sting.gatk.walkers.genotyper;
 
 import com.google.java.contract.Requires;
-import org.apache.commons.lang.ArrayUtils;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
 import org.broadinstitute.sting.utils.Haplotype;
 import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
@@ -53,13 +53,14 @@ public class ErrorModel  {
 
         PairHMMIndelErrorModel pairModel = null;
         LinkedHashMap haplotypeMap = null;
-        HashMap> indelLikelihoodMap = null;
         double[][] perReadLikelihoods = null;
 
         double[] model = new double[maxQualityScore+1];
         Arrays.fill(model,Double.NEGATIVE_INFINITY);
 
         boolean hasCalledAlleles = false;
+
+        final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
         if (refSampleVC != null) {
 
             for (Allele allele : refSampleVC.getAlleles()) {
@@ -71,8 +72,7 @@ public class ErrorModel  {
             haplotypeMap = new LinkedHashMap();
             if (refSampleVC.isIndel()) {
                 pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
-                        UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
-                indelLikelihoodMap = new HashMap>();
+                        UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
                 IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
             }
         }
@@ -92,12 +92,12 @@ public class ErrorModel  {
 
             Allele refAllele = refSampleVC.getReference();
 
-            if (refSampleVC.isIndel()) {
+            if ( refSampleVC.isIndel()) {
                 final int readCounts[] = new int[refSamplePileup.getNumberOfElements()];
                 //perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()];
                 final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles());
                 if (!haplotypeMap.isEmpty())
-                    perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts);
+                    perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap, readCounts);
             }
             int idx = 0;
             for (PileupElement refPileupElement : refSamplePileup) {
@@ -195,8 +195,8 @@ public class ErrorModel  {
         if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getEventLength() == -eventLength)
             return true;
 
-        if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
-                Arrays.equals(pileupElement.getEventBases().getBytes(),alleleBases))
+                if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
+                Arrays.equals(pileupElement.getEventBases().getBytes(),Arrays.copyOfRange(alleleBases,1,alleleBases.length))) // allele contains ref byte, but pileupElement's event bases doesn't
             return true;
 
         return false;
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
index 6b0831323..303ab94d6 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoods.java
@@ -26,6 +26,8 @@
 package org.broadinstitute.sting.gatk.walkers.genotyper;
 
 import net.sf.samtools.SAMUtils;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACcounts;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
 import org.broadinstitute.sting.utils.collections.Pair;
@@ -123,7 +125,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
      *
      *
      */
-    protected static class SumIterator {
+    public static class SumIterator {
         private int[] currentState;
         private final int[] finalState;
         private final int restrictSumTo;
@@ -491,32 +493,32 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
             // If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
             // and we repeat until queue is empty
             // queue of AC conformations to process
-            final LinkedList ACqueue = new LinkedList();
+            final LinkedList ACqueue = new LinkedList();
             // mapping of ExactACset indexes to the objects
-            final HashMap indexesToACset = new HashMap(likelihoodDim);
+            final HashMap indexesToACset = new HashMap(likelihoodDim);
             // add AC=0 to the queue
             final int[] zeroCounts = new int[nAlleles];
             zeroCounts[0] = numChromosomes;
 
-            AlleleFrequencyCalculationModel.ExactACset zeroSet =
-                    new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));
+            ExactACset zeroSet =
+                    new ExactACset(1, new ExactACcounts(zeroCounts));
 
             ACqueue.add(zeroSet);
-            indexesToACset.put(zeroSet.ACcounts, zeroSet);
+            indexesToACset.put(zeroSet.getACcounts(), zeroSet);
 
             // keep processing while we have AC conformations that need to be calculated
             double maxLog10L = Double.NEGATIVE_INFINITY;
             while ( !ACqueue.isEmpty() ) {
                 // compute log10Likelihoods
-                final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
+                final ExactACset ACset = ACqueue.remove();
                 final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);
 
                 // adjust max likelihood seen if needed
                 maxLog10L = Math.max(maxLog10L, log10LofKs);
                 // clean up memory
-                indexesToACset.remove(ACset.ACcounts);
+                indexesToACset.remove(ACset.getACcounts());
                 if ( VERBOSE )
-                    System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
+                    System.out.printf(" *** removing used set=%s%n", ACset.getACcounts());
 
              }
 
@@ -525,13 +527,13 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
             int plIdx = 0;
             SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
             while (iterator.hasNext()) {
-                AlleleFrequencyCalculationModel.ExactACset ACset =
-                       new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
+                ExactACset ACset =
+                       new ExactACset(1, new ExactACcounts(iterator.getCurrentVector()));
                 // for observed base X, add Q(jX,k) to likelihood vector for all k in error model
                 //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) +  nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
                 getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
 
-                setLogPLs(plIdx++, ACset.log10Likelihoods[0]);
+                setLogPLs(plIdx++, ACset.getLog10Likelihoods()[0]);
                 iterator.next();
             }
         }
@@ -540,40 +542,40 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
 
     }
 
-    private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
+    private double calculateACConformationAndUpdateQueue(final ExactACset set,
                                                          final ErrorModel errorModel,
                                                          final List alleleList,
                                                          final List numObservations,
                                                          final double  maxLog10L,
-                                                         final LinkedList ACqueue,
-                                                         final HashMap indexesToACset,
+                                                         final LinkedList ACqueue,
+                                                         final HashMap indexesToACset,
                                                          final ReadBackedPileup pileup) {
         // compute likelihood of set
         getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
-        final double log10LofK = set.log10Likelihoods[0];
+        final double log10LofK = set.getLog10Likelihoods()[0];
         
         // log result in PL vector
-        int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes);
+        int idx = getLinearIndex(set.getACcounts().getCounts(), nAlleles, numChromosomes);
         setLogPLs(idx, log10LofK);
 
         // can we abort early because the log10Likelihoods are so small?
         if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
             if ( VERBOSE )
-                System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
+                System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, maxLog10L);
             return log10LofK;
         }
 
         // iterate over higher frequencies if possible
         // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
-        final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0];
+        final int ACwiggle = numChromosomes - set.getACsum() + set.getACcounts().getCounts()[0];
         if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
             return log10LofK;
 
 
         // add conformations for other cases
         for ( int allele = 1; allele < nAlleles; allele++ ) {
-            final int[] ACcountsClone = set.ACcounts.getCounts().clone();
+            final int[] ACcountsClone = set.getACcounts().getCounts().clone();
             ACcountsClone[allele]++;
             // is this a valid conformation?
             int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
@@ -597,7 +599,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
      * @param numObservations Number of observations for each allele
      * @param pileup        Read backed pileup in case it's necessary
      */
-    public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
+    public abstract void getLikelihoodOfConformation(final ExactACset ACset,
                                                      final ErrorModel errorModel,
                                                      final List alleleList,
                                                      final List numObservations,
@@ -608,12 +610,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
 
     // Static methods
     public static void updateACset(final int[] newSetCounts,
-                                    final LinkedList ACqueue,
-                                    final HashMap indexesToACset) {
+                                    final LinkedList ACqueue,
+                                    final HashMap indexesToACset) {
 
-        final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
+        final ExactACcounts index = new ExactACcounts(newSetCounts);
         if ( !indexesToACset.containsKey(index) ) {
-            AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
+            ExactACset newSet = new ExactACset(1, index);
             indexesToACset.put(index, newSet);
             ACqueue.add(newSet);     
             if (VERBOSE)
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
index f6ce818be..f6ad445c7 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyGenotypeLikelihoodsCalculationModel.java
@@ -41,15 +41,6 @@ import java.util.*;
 
 public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
 
-    //protected Set laneIDs;
-    public enum Model {
-        SNP,
-        INDEL,
-        POOLSNP,
-        POOLINDEL,
-        BOTH
-    }
-
     final protected UnifiedArgumentCollection UAC;
 
     protected GeneralPloidyGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
@@ -203,7 +194,8 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
                                          final AlignmentContextUtils.ReadOrientation contextType,
                                          final List allAllelesToUse,
                                          final boolean useBAQedPileup,
-                                         final GenomeLocParser locParser) {
+                                         final GenomeLocParser locParser,
+                                         final Map perReadAlleleLikelihoodMap) {
 
         HashMap perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts);
         if (perLaneErrorModels == null && UAC.referenceSampleName != null)
@@ -215,8 +207,11 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
             newContext.put(DUMMY_SAMPLE_NAME,mergedContext);
             contexts = newContext;
         }
-
-        // get initial alleles to genotype
+        if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
+            // starting a new site: discard the per-sample likelihoods cached for the previous site
+            perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods
+        }
+        // get initial alleles to genotype
         final List allAlleles = new ArrayList();
         if (allAllelesToUse == null || allAllelesToUse.isEmpty())
             allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse));
@@ -234,9 +229,13 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
                 continue;
 
             ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
+            if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){
+                // no likelihoods have been computed for this sample at this site
+                perReadAlleleLikelihoodMap.put(sample.getKey(), org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap());
+            }
 
             // create the GenotypeLikelihoods object
-            final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO);
+            final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO, perReadAlleleLikelihoodMap.get(sample.getKey()));
             // actually compute likelihoods
             final int nGoodBases = GL.add(pileup, UAC);
             if ( nGoodBases > 0 )
@@ -246,7 +245,7 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
 
         // find the alternate allele(s) that we should be using
         final List alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs);
-        if (alleles == null || alleles.isEmpty())
+        if (alleles == null || alleles.isEmpty() || (alleles.size() == 1 && alleles.get(0).isReference()))
             return null;
         // start making the VariantContext
         final GenomeLoc loc = ref.getLocus();
@@ -333,7 +332,8 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
                                                                                final HashMap perLaneErrorModels,
                                                                                final boolean useBQAedPileup,
                                                                                final ReferenceContext ref,
-                                                                               final boolean ignoreLaneInformation);
+                                                                               final boolean ignoreLaneInformation,
+                                                                               final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap);
 
     protected abstract List getInitialAllelesToUse(final RefMetaDataTracker tracker,
                                                            final ReferenceContext ref,
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
index 34267b9a8..4bcaa5ff9 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoods.java
@@ -1,6 +1,7 @@
 package org.broadinstitute.sting.gatk.walkers.genotyper;
 
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
 import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
 import org.broadinstitute.sting.utils.Haplotype;
 import org.broadinstitute.sting.utils.MathUtils;
@@ -26,6 +27,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
     double[][] readHaplotypeLikelihoods;
 
     final byte refBase;
+    final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap;
 
     public GeneralPloidyIndelGenotypeLikelihoods(final List alleles,
                                                  final double[] logLikelihoods,
@@ -34,7 +36,8 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
                                                  final boolean ignoreLaneInformation,
                                                  final PairHMMIndelErrorModel pairModel,
                                                  final LinkedHashMap haplotypeMap,
-                                                 final ReferenceContext referenceContext) {
+                                                 final ReferenceContext referenceContext,
+                                                 final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) {
         super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
         this.pairModel = pairModel;
         this.haplotypeMap = haplotypeMap;
@@ -42,6 +45,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
         this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
         // todo - not needed if indel alleles have base at current position
         this.refBase = referenceContext.getBase();
+        this.perReadAlleleLikelihoodMap = perReadAlleleLikelihoodMap;
     }
 
     // -------------------------------------------------------------------------------------
@@ -142,8 +146,9 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
         List numSeenBases = new ArrayList(this.alleles.size());
 
         if (!hasReferenceSampleData) {
+ 
             final int readCounts[] = new int[pileup.getNumberOfElements()];
-            readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts);
+            readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap, readCounts);
             n = readHaplotypeLikelihoods.length;
         } else {
             Allele refAllele = null;
@@ -184,12 +189,12 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
      * @param alleleList    List of alleles
      * @param numObservations Number of observations for each allele in alleleList
      */
-    public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
+    public void getLikelihoodOfConformation(final ExactACset ACset,
                                             final ErrorModel errorModel,
                                             final List alleleList,
                                             final List numObservations,
                                             final ReadBackedPileup pileup) {
-        final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size());
+        final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), alleleList.size());
         double p1 = 0.0;
 
         if (!hasReferenceSampleData) {
@@ -214,6 +219,6 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
             }
             p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec);
         }
-        ACset.log10Likelihoods[0] = p1;
+        ACset.getLog10Likelihoods()[0] = p1;
    }
 }
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
index f6559f666..eb4cf1839 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyIndelGenotypeLikelihoodsCalculationModel.java
@@ -62,7 +62,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
 
 
         pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
-                UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
+                UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
         haplotypeMap = new LinkedHashMap();
     }
 
@@ -73,8 +73,9 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
                                                                                final HashMap perLaneErrorModels,
                                                                                final boolean useBQAedPileup,
                                                                                final ReferenceContext ref,
-                                                                               final boolean ignoreLaneInformation){
-        return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref);
+                                                                               final boolean ignoreLaneInformation,
+                                                                               final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){
+        return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref, perReadAlleleLikelihoodMap);
     }
 
     protected List getInitialAllelesToUse(final RefMetaDataTracker tracker,
@@ -90,7 +91,6 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
         if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE)
             alleles = alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE);
         if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
-            IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().clear();
             haplotypeMap.clear();
         }
         IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap);
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
index 944372907..0f0f85441 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java
@@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
 
 
 import net.sf.samtools.SAMUtils;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.baq.BAQ;
@@ -12,7 +13,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
 import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
 
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
 
 import static java.lang.Math.log10;
 import static java.lang.Math.pow;
@@ -218,12 +222,12 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
      * @param alleleList    List of alleles
      * @param numObservations Number of observations for each allele in alleleList
       */
-    public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
+    public void getLikelihoodOfConformation(final ExactACset ACset,
                                             final ErrorModel errorModel,
                                             final List alleleList,
                                             final List numObservations,
                                             final ReadBackedPileup pileup) {
-        final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length);
+        final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), BaseUtils.BASES.length);
         final int[] ac = new int[BaseUtils.BASES.length];
         
         for (int k=0; k < BaseUtils.BASES.length; k++ )
@@ -238,9 +242,9 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
                 final byte qual = qualToUse(elt, true, true, mbq);
                 if ( qual == 0 )
                     continue;
-                final double acc[] = new double[ACset.ACcounts.counts.length];
+                final double acc[] = new double[ACset.getACcounts().getCounts().length];
                 for (int k=0; k < acc.length; k++ )
-                    acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]]
+                    acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.getACcounts().getCounts()[k]]
                             - LOG10_PLOIDY;
                 p1 += MathUtils.log10sumLog10(acc);
             }
@@ -264,7 +268,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
     
             p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec);
         }
-        ACset.log10Likelihoods[0] = p1;
+        ACset.getLog10Likelihoods()[0] = p1;
         /*        System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1));
         System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ)));
       */
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
index 30d614455..9f2fdc096 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java
@@ -49,7 +49,8 @@ public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends General
                                                                                final HashMap perLaneErrorModels,
                                                                                final boolean useBQAedPileup,
                                                                                final ReferenceContext ref,
-                                                                               final boolean ignoreLaneInformation) {
+                                                                               final boolean ignoreLaneInformation,
+                                                                               final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){
         return new GeneralPloidySNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO);
     }
 
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java
new file mode 100644
index 000000000..e9ed6b153
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceTest.java
@@ -0,0 +1,315 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
+
+import org.apache.log4j.ConsoleAppender;
+import org.apache.log4j.Logger;
+import org.apache.log4j.TTCCLayout;
+import org.broadinstitute.sting.gatk.report.GATKReport;
+import org.broadinstitute.sting.gatk.report.GATKReportTable;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.SimpleTimer;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+import org.broadinstitute.sting.utils.variantcontext.Genotype;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * A simple GATK utility (i.e., runs from the command line) for assessing the performance of
+ * the exact model
+ */
+public class AFCalcPerformanceTest {
+    final static Logger logger = Logger.getLogger(AFCalcPerformanceTest.class);
+
+    private static abstract class Analysis { // one benchmark scenario; its rows accumulate in its own GATKReport
+        final GATKReport report;
+        /** Creates the backing report via GATKReport.newSimpleReport(name, columns). */
+        public Analysis(final String name, final List columns) {
+            report = GATKReport.newSimpleReport(name, columns);
+        }
+        /** Runs the scenario against testBuilder; callers in this file pass the leading row values as coreColumns. */
+        public abstract void run(final AFCalcTestBuilder testBuilder,
+                                 final List coreColumns);
+        /** @return the table name of this analysis' table */
+        public String getName() {
+            return getTable().getTableName();
+        }
+        /** @return the first table yielded by report.getTables() */
+        public GATKReportTable getTable() {
+            return report.getTables().iterator().next();
+        }
+    }
+
+    private static class AnalyzeByACAndPL extends Analysis { // runtime as a function of the requested allele counts
+        public AnalyzeByACAndPL(final List columns) {
+            super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac"));
+        }
+        /** Times calc.getLog10PNonRef once per AC combination from makeACs and adds one report row per call. */
+        public void run(final AFCalcTestBuilder testBuilder, final List coreValues) {
+            final SimpleTimer timer = new SimpleTimer();
+            // single-element list: only a PL of 100 is exercised for the genotypes a sample does not have
+            for ( final int nonTypePL : Arrays.asList(100) ) {
+                final AFCalc calc = testBuilder.makeModel();
+                final double[] priors = testBuilder.makePriors();
+                // the chromosome count handed to makeACs is twice the number of samples
+                for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) {
+                    final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);
+                    // only the getLog10PNonRef call sits between start() and the elapsed-time read
+                    timer.start();
+                    final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors);
+                    final long runtime = timer.getElapsedTimeNano();
+                    // nAltSeg = number of alts with AC > 0; otherAC = summed AC of every alt after the first
+                    int otherAC = 0;
+                    int nAltSeg = 0;
+                    for ( int i = 0; i < ACs.length; i++ ) {
+                        nAltSeg += ACs[i] > 0 ? 1 : 0;
+                        if ( i > 0 ) otherAC += ACs[i];
+                    }
+                    // row = core values, then runtime and n.evaluations, then the four columns appended in the constructor
+                    final List columns = new LinkedList(coreValues);
+                    columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC));
+                    report.addRowList(columns);
+                }
+            }
+        }
+        /** Enumerates AC vectors: every candidate i < nChrom, paired (for two alts) with every candidate j < nChrom - i. */
+        private List makeACs(final int nAltAlleles, final int nChrom) {
+            if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3");
+            // NOTE(review): nAltAlleles < 1 is not rejected here and would end in the "cannot get here" branch below
+            final List ACs = new LinkedList();
+            // candidate counts come from MathUtils.log10LinearRange; the old fixed list is kept in the trailing comment
+            final List ACsToTry = MathUtils.log10LinearRange(0, nChrom, 0.1); //Arrays.asList(0, 1, 2, 3, 6, 10, 20, 40, 60, 100, 200, 400, 600, 1000, 2000, 4000, 6000, 10000, 100000);
+
+            for ( int i : ACsToTry ) {
+                if ( i < nChrom ) {
+                    if ( nAltAlleles == 1 ) {
+                        ACs.add(new int[]{i});
+                    } else if ( nAltAlleles == 2 ) {
+                        for ( int j : ACsToTry ) {
+                            if ( j < nChrom - i )
+                                ACs.add(new int[]{i, j});
+                        }
+                    } else {
+                        throw new IllegalStateException("cannot get here");
+                    }
+                }
+            }
+
+            return ACs;
+        }
+    }
+
+    private static class AnalyzeBySingletonPosition extends Analysis { // runtime as a function of how far the genotype list is rotated
+        public AnalyzeBySingletonPosition(final List columns) {
+            super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton"));
+        }
+        /** Times getLog10PNonRef on one AC=1 site whose genotype list is rotated by each tested position; one row per call. */
+        public void run(final AFCalcTestBuilder testBuilder, final List coreValues) {
+            final SimpleTimer timer = new SimpleTimer();
+            // single-element list: only a PL of 100 is exercised for the genotypes a sample does not have
+            for ( final int nonTypePL : Arrays.asList(100) ) {
+                final AFCalc calc = testBuilder.makeModel();
+                final double[] priors = testBuilder.makePriors();
+                // AC of 1 for the first alt allele, 0 for any further alt
+                final int[] ac = new int[testBuilder.numAltAlleles];
+                ac[0] = 1;
+                final VariantContext vc = testBuilder.makeACTest(ac, 0, nonTypePL);
+                // each iteration rotates a fresh copy of vc's genotypes, so rotations do not accumulate
+                for ( final int position : MathUtils.log10LinearRange(0, vc.getNSamples(), 0.1) ) {
+                    final VariantContextBuilder vcb = new VariantContextBuilder(vc);
+                    final List genotypes = new ArrayList(vc.getGenotypes());
+                    Collections.rotate(genotypes, position);
+                    vcb.genotypes(genotypes);
+                    // building the rotated context (vcb.make()) happens inside the timed region
+                    timer.start();
+                    final AFCalcResult resultTracker = calc.getLog10PNonRef(vcb.make(), priors);
+                    final long runtime = timer.getElapsedTimeNano();
+                    // row = core values, then runtime and n.evaluations, then the two columns appended in the constructor
+                    final List columns = new LinkedList(coreValues);
+                    columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, position));
+                    report.addRowList(columns);
+                }
+            }
+        }
+    }
+
+    private static class AnalyzeByNonInformative extends Analysis { // runtime as a function of the number of non-informative samples
+        public AnalyzeByNonInformative(final List columns) {
+            super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative"));
+        }
+        /** Times getLog10PNonRef on an AC=1 site for every nNonInformative in [0, nSamples); one row per call. */
+        public void run(final AFCalcTestBuilder testBuilder, final List coreValues) {
+            final SimpleTimer timer = new SimpleTimer();
+            // single-element list: only a PL of 100 is exercised for the genotypes a sample does not have
+            for ( final int nonTypePL : Arrays.asList(100) ) {
+                final AFCalc calc = testBuilder.makeModel();
+                final double[] priors = testBuilder.makePriors();
+                // AC of 1 for the first alt allele, 0 for any further alt
+                final int[] ac = new int[testBuilder.numAltAlleles];
+                ac[0] = 1;
+                // a new VariantContext is built for each count of non-informative samples
+                for ( int nNonInformative = 0; nNonInformative < testBuilder.nSamples; nNonInformative++ ) {
+                    final VariantContext vc = testBuilder.makeACTest(ac, nNonInformative, nonTypePL);
+                    // only the getLog10PNonRef call sits between start() and the elapsed-time read
+                    timer.start();
+                    final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors);
+                    final long runtime = timer.getElapsedTimeNano();
+                    // row = core values, then runtime and n.evaluations, then the two columns appended in the constructor
+                    final List columns = new LinkedList(coreValues);
+                    columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, nNonInformative));
+                    report.addRowList(columns);
+                }
+            }
+        }
+    }
+
+    private static class ModelParams {
+        final AFCalcFactory.Calculation modelType;
+        final int maxBiNSamples, maxTriNSamples;
+
+        private ModelParams(AFCalcFactory.Calculation modelType, int maxBiNSamples, int maxTriNSamples) {
+            this.modelType = modelType;
+            this.maxBiNSamples = maxBiNSamples;
+            this.maxTriNSamples = maxTriNSamples;
+        }
+
+        public boolean meetsConstraints(final int nAltAlleles, final int nSamples) {
+            if ( nAltAlleles == 1 )
+                return nSamples <= maxBiNSamples;
+            else if ( nAltAlleles == 2 )
+                return nSamples <= maxTriNSamples;
+            else
+                throw new IllegalStateException("Unexpected number of alt alleles " + nAltAlleles);
+        }
+    }
+
+    public enum Operation { // parsed from the first command-line argument in main()
+        ANALYZE,   // main() dispatches to analyze(args)
+        SINGLE,    // main() dispatches to profileBig(args)
+        EXACT_LOG  // main() dispatches to exactLog(args)
+    }
+    public static void main(final String[] args) throws Exception { // args[0] names an Operation; the rest is operation-specific
+        final TTCCLayout layout = new TTCCLayout();
+        layout.setThreadPrinting(false);
+        layout.setCategoryPrefixing(false);
+        layout.setContextPrinting(false);
+        logger.addAppender(new ConsoleAppender(layout));
+        // fail with a usage hint rather than an ArrayIndexOutOfBoundsException when no operation is given
+        if ( args.length == 0 ) throw new IllegalArgumentException("usage: AFCalcPerformanceTest " + Arrays.asList(Operation.values()) + " [operation args]");
+        final Operation op = Operation.valueOf(args[0]);
+        switch ( op ) {
+            case ANALYZE: analyze(args); break;
+            case SINGLE: profileBig(args); break;
+            case EXACT_LOG: exactLog(args); break;
+            default: throw new IllegalArgumentException("unknown operation " + op); // a bad argument, not a reflective access failure
+        }
+    }
+
+    private static void exactLog(final String[] args) throws Exception { // args: EXACT_LOG <reference fasta> <exact log file> [start ...]
+        final File ref = new File(args[1]);
+        final File exactLogFile = new File(args[2]);
+        final List startsToUse = new LinkedList();
+        for ( int i = 3; i < args.length; i++ )
+            startsToUse.add(Integer.valueOf(args[i]));
+        final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(ref);
+        final GenomeLocParser parser = new GenomeLocParser(seq);
+        final List loggedCalls;
+        final BufferedReader reader = new BufferedReader(new FileReader(exactLogFile));
+        try { loggedCalls = ExactCallLogger.readExactLog(reader, startsToUse, parser); }
+        finally { reader.close(); } // release the log file even if parsing fails; assumes readExactLog reads eagerly
+        // re-run every logged call with EXACT_INDEPENDENT / human priors and log it next to the recorded result
+        for ( final ExactCallLogger.ExactCall call : loggedCalls ) {
+            final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(call.vc.getNSamples(), 1,
+                    AFCalcFactory.Calculation.EXACT_INDEPENDENT,
+                    AFCalcTestBuilder.PriorType.human);
+            logger.info(call);
+            final SimpleTimer timer = new SimpleTimer().start();
+            final AFCalcResult result = testBuilder.makeModel().getLog10PNonRef(call.vc, testBuilder.makePriors());
+            final long newNanoTime = timer.getElapsedTimeNano();
+            if ( call.originalCall.anyPolymorphic(-1) || result.anyPolymorphic(-1) ) {
+                logger.info("**** ONE IS POLY");
+            }
+            logger.info("\t\t getLog10PosteriorOfAFGT0: " + call.originalCall.getLog10PosteriorOfAFGT0() + " vs " + result.getLog10PosteriorOfAFGT0());
+            final double speedup = call.runtime / (1.0 * newNanoTime);
+            logger.info("\t\t runtime:                  " + call.runtime + " vs " + newNanoTime + " speedup " + String.format("%.2f", speedup) + "x");
+            for ( final Allele a : call.originalCall.getAllelesUsedInGenotyping() ) {
+                if ( a.isNonReference() ) {
+                    final String warningmeMLE = call.originalCall.getAlleleCountAtMLE(a) != result.getAlleleCountAtMLE(a) ? " DANGER-MLE-DIFFERENT" : "";
+                    logger.info("\t\t   MLE       " + a + ":            " + call.originalCall.getAlleleCountAtMLE(a) + " vs " + result.getAlleleCountAtMLE(a) + warningmeMLE);
+                    final String warningmePost = call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) == 0 && result.getLog10PosteriorOfAFGt0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : "";
+                    logger.info("\t\t   Posterior " + a + ":            " + call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFGt0ForAllele(a) + warningmePost);
+                }
+            }
+        }
+    }
+
+    private static void profileBig(final String[] args) throws Exception { // args: SINGLE <nSamples> <ac>
+        final int sampleCount = Integer.parseInt(args[1]);
+        final int altCount = Integer.parseInt(args[2]);
+        // one alt allele, EXACT_INDEPENDENT model, human priors
+        final AFCalcTestBuilder builder = new AFCalcTestBuilder(sampleCount, 1,
+                AFCalcFactory.Calculation.EXACT_INDEPENDENT,
+                AFCalcTestBuilder.PriorType.human);
+        final int[] requestedACs = { altCount };
+        final VariantContext site = builder.makeACTest(requestedACs, 0, 100);
+        final SimpleTimer stopwatch = new SimpleTimer().start();
+        final AFCalc model = builder.makeModel(); // model and prior construction count towards the measured time
+        final AFCalcResult outcome = model.getLog10PNonRef(site, builder.makePriors());
+        final long elapsedNano = stopwatch.getElapsedTimeNano();
+        logger.info("result " + outcome.getLog10PosteriorOfAFGT0());
+        logger.info("runtime " + elapsedNano);
+    }
+
+    private static void analyze(final String[] args) throws Exception { // args: ANALYZE <output report file>
+        final List coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples",
+                "exact.model", "prior.type", "runtime", "n.evaluations");
+        // opened before any benchmark runs, so an unwritable output path fails immediately
+        final PrintStream out = new PrintStream(new FileOutputStream(args[1]));
+        // per model: the largest nSamples to run with one alt allele, then with two (see ModelParams.meetsConstraints)
+        final List modelParams = Arrays.asList(
+                new ModelParams(AFCalcFactory.Calculation.EXACT_REFERENCE, 10000, 10),
+//                new ModelParams(AFCalcTestBuilder.ModelType.GeneralExact, 100, 10),
+                new ModelParams(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 10000, 1000));
+        // the flag now means what it says: true -> human priors only, false -> every PriorType
+        final boolean ONLY_HUMAN_PRIORS = true;
+        final List priorTypes = ONLY_HUMAN_PRIORS
+                ? Arrays.asList(AFCalcTestBuilder.PriorType.human)
+                : Arrays.asList(AFCalcTestBuilder.PriorType.values());
+
+        final List analyzes = new ArrayList();
+        analyzes.add(new AnalyzeByACAndPL(coreColumns));
+        analyzes.add(new AnalyzeBySingletonPosition(coreColumns));
+        //analyzes.add(new AnalyzeByNonInformative(coreColumns));
+        // full grid: iteration x alt-allele count x sample count x model x prior type x analysis
+        for ( int iteration = 0; iteration < 1; iteration++ ) {
+            for ( final int nAltAlleles : Arrays.asList(1, 2) ) {
+                for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
+                    for ( final ModelParams modelToRun : modelParams) {
+                        if ( modelToRun.meetsConstraints(nAltAlleles, nSamples) ) {
+                            for ( final AFCalcTestBuilder.PriorType priorType : priorTypes ) {
+                                final AFCalcTestBuilder testBuilder
+                                        = new AFCalcTestBuilder(nSamples, nAltAlleles, modelToRun.modelType, priorType);
+
+                                for ( final Analysis analysis : analyzes ) {
+                                    logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType, analysis.getName())));
+                                    final List values = Arrays.asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType);
+                                    analysis.run(testBuilder, (List)values);
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        // gather the table of every analysis into one report and write it to the output file
+        final GATKReport report = new GATKReport();
+        for ( final Analysis analysis : analyzes )
+            report.addTable(analysis.getTable());
+        report.print(out);
+        out.close();
+    }
+}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java
new file mode 100644
index 000000000..6f3740ab3
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcTestBuilder.java
@@ -0,0 +1,174 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
+
+import org.apache.commons.lang.ArrayUtils;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.variantcontext.*;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+public class AFCalcTestBuilder { // factory for synthetic VariantContexts, AFCalc models and prior vectors
+    final static Allele A = Allele.create("A", true);
+    final static Allele C = Allele.create("C");
+    final static Allele G = Allele.create("G");
+    final static Allele T = Allele.create("T");
+    final static Allele AA = Allele.create("AA");
+    final static Allele AT = Allele.create("AT");
+    final static Allele AG = Allele.create("AG");
+    // suffix of generated sample names; bumped on every makePL call, shared by all builders, not synchronised
+    static int sampleNameCounter = 0;
+
+    final int nSamples;
+    final int numAltAlleles;
+    final AFCalcFactory.Calculation modelType;
+    final PriorType priorType;
+    /** Stores the four settings; nothing is validated here. */
+    public AFCalcTestBuilder(final int nSamples, final int numAltAlleles,
+                             final AFCalcFactory.Calculation modelType, final PriorType priorType) {
+        this.nSamples = nSamples;
+        this.numAltAlleles = numAltAlleles;
+        this.modelType = modelType;
+        this.priorType = priorType;
+    }
+
+    @Override
+    public String toString() {
+        return String.format("AFCalcTestBuilder nSamples=%d nAlts=%d model=%s prior=%s", nSamples, numAltAlleles, modelType, priorType);
+    }
+    /** Selects which prior vector makePriors() produces. */
+    public enum PriorType {
+        flat,
+        human
+    }
+
+    public int getNumAltAlleles() {
+        return numAltAlleles;
+    }
+
+    public int getnSamples() {
+        return nSamples;
+    }
+    /** @return a new AFCalc from AFCalcFactory for this builder's settings; the trailing 2 is presumably the ploidy - confirm */
+    public AFCalc makeModel() {
+        return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), 2);
+    }
+    /** @return a prior vector with 2*nSamples+1 entries, built according to priorType */
+    public double[] makePriors() {
+        final int nPriorValues = 2*nSamples+1;
+
+        switch ( priorType ) {
+            case flat:
+                return MathUtils.normalizeFromLog10(new double[nPriorValues], true);  // flat priors
+            case human:
+                final double[] humanPriors = new double[nPriorValues];
+                UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
+                return humanPriors;
+            default:
+                throw new RuntimeException("Unexpected type " + priorType);
+        }
+    }
+    /** Convenience overload: unboxes ACs and delegates to makeACTest(int[], int, int). */
+    public VariantContext makeACTest(final List ACs, final int nNonInformative, final int nonTypePL) {
+        return makeACTest(ArrayUtils.toPrimitive(ACs.toArray(new Integer[]{})), nNonInformative, nonTypePL);
+    }
+    /** Converts per-alt allele counts into het / hom-var sample counts and delegates to the four-argument overload. */
+    public VariantContext makeACTest(final int[] ACs, final int nNonInformative, final int nonTypePL) {
+        final int nChrom = nSamples * 2;
+
+        final int[] nhet = new int[numAltAlleles];
+        final int[] nhomvar = new int[numAltAlleles];
+        // per alt: p = AC / nChrom, hom-vars = floor((nSamples - nNonInformative) * p^2), remaining copies go to hets
+        for ( int i = 0; i < ACs.length; i++ ) {
+            final double p = ACs[i] / (1.0 * nChrom);
+            nhomvar[i] = (int)Math.floor((nSamples - nNonInformative) * p * p);
+            nhet[i] = ACs[i] - 2 * nhomvar[i];
+
+            if ( nhet[i] < 0 )
+                throw new IllegalStateException("Bug! nhet[i] < 0");
+        }
+        // sanity check: hets + 2 * hom-vars must reproduce the requested total AC
+        final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar);
+        if ( calcAC != MathUtils.sum(ACs) )
+            throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs));
+
+        return makeACTest(nhet, nhomvar, nNonInformative, nonTypePL);
+    }
+    /** Assembles the VariantContext: hets and hom-vars per alt, then non-informative samples, then hom-ref filler. */
+    public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nNonInformative, final int nonTypePL) {
+        List samples = new ArrayList(nSamples);
+
+        for ( int altI = 0; altI < nhet.length; altI++ ) {
+            for ( int i = 0; i < nhet[altI]; i++ )
+                samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1));
+            for ( int i = 0; i < nhomvar[altI]; i++ )
+                samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1));
+        }
+        // nCopies repeats one Genotype object, so all non-informative (and all hom-ref) entries share a sample name
+        final Genotype nonInformative = makeNonInformative();
+        samples.addAll(Collections.nCopies(nNonInformative, nonInformative));
+
+        final int nRef = Math.max((int) (nSamples - nNonInformative - MathUtils.sum(nhet) - MathUtils.sum(nhomvar)), 0);
+        samples.addAll(Collections.nCopies(nRef, makePL(GenotypeType.HOM_REF, nonTypePL, 0)));
+
+        // nRef is clamped at 0, so the list holds at least nSamples entries; when hets + hom-vars + non-informative
+        // exceed nSamples the surplus is dropped here and the realised allele counts can be lower than requested.
+        // (A size check placed after this subList could never fire and has been removed.)
+        samples = samples.subList(0, nSamples);
+
+        VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles());
+        vcb.genotypes(samples);
+        return vcb.make();
+    }
+    /** @return A followed by the first numAltAlleles of (C, G, T, AA, AT, AG); more than 6 alts makes subList throw */
+    public List getAlleles() {
+        return Arrays.asList(A, C, G, T, AA, AT, AG).subList(0, numAltAlleles+1);
+    }
+    /** @return the two alleles of a genotype of the given type; altI indexes into getAlleles(), where 0 is A */
+    public List getAlleles(final GenotypeType type, final int altI) {
+        switch (type) {
+            case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0));
+            case HET:     return Arrays.asList(getAlleles().get(0), getAlleles().get(altI));
+            case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI));
+            default: throw new IllegalArgumentException("Unexpected type " + type);
+        }
+    }
+    /** Creates a genotype with the given alleles and PLs under the next "sample<N>" name. */
+    public Genotype makePL(final List expectedGT, int ... pls) {
+        GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
+        gb.alleles(expectedGT);
+        gb.PL(pls);
+        return gb.make();
+    }
+    /** @return GenotypeLikelihoods.numLikelihoods for numAltAlleles+1 alleles; the 2 is presumably the ploidy - confirm */
+    private int numPLs() {
+        return GenotypeLikelihoods.numLikelihoods(numAltAlleles+1, 2);
+    }
+    /** @return a NO_CALL/NO_CALL genotype whose PLs are all 0 and as numerous as those of every other genotype built here */
+    public Genotype makeNonInformative() {
+        final int[] nonInformativePLs = new int[numPLs()]; // was sized from numAltAlleles alone, one allele short of numPLs()
+        return makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), nonInformativePLs);
+    }
+    /** Creates a genotype of the given type: PL 0 at its own genotype index and nonTypePL everywhere else. */
+    public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) {
+        GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
+        gb.alleles(getAlleles(type, altI));
+
+        final int[] pls = new int[numPLs()];
+        Arrays.fill(pls, nonTypePL);
+        // other GenotypeTypes never reach the switch: getAlleles(type, altI) above already threw for them
+        int index = 0;
+        switch ( type ) {
+            case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break;
+            case HET:     index = GenotypeLikelihoods.calculatePLindex(0, altI); break;
+            case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break;
+        }
+        pls[index] = 0;
+        gb.PL(pls);
+
+        return gb.make();
+    }
+}
\ No newline at end of file
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java
similarity index 57%
rename from protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java
rename to protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java
index 78ab11eb1..b248c8759 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyExactAFCalculationModel.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyExactAFCalc.java
@@ -23,56 +23,52 @@
  * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
-package org.broadinstitute.sting.gatk.walkers.genotyper;
+package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
 
-import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
 import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.variantcontext.*;
 
-import java.io.PrintStream;
 import java.util.*;
 
-public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel {
+public class GeneralPloidyExactAFCalc extends ExactAFCalc {
     static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
-    final protected UnifiedArgumentCollection UAC;
 
     private final int ploidy;
     private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
     private final static boolean VERBOSE = false;
 
-    protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
-        super(UAC, N, logger, verboseWriter);
-        ploidy = UAC.samplePloidy;
-        this.UAC = UAC;
-
+    protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
+        super(nSamples, maxAltAlleles, ploidy);
+        this.ploidy = ploidy;
     }
 
-    public List getLog10PNonRef(final VariantContext vc,
-                                        final double[] log10AlleleFrequencyPriors,
-                                        final AlleleFrequencyCalculationResult result) {
-
-        GenotypesContext GLs = vc.getGenotypes();
-        List alleles = vc.getAlleles();
-
+    @Override
+    protected VariantContext reduceScope(VariantContext vc) {
         // don't try to genotype too many alternate alleles
-        if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
-            logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
+        if ( vc.getAlternateAlleles().size() > getMaxAltAlleles()) {
+            logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
 
-            alleles = new ArrayList(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
+            final List alleles = new ArrayList(getMaxAltAlleles() + 1);
             alleles.add(vc.getReference());
-            alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));
+            alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles(), ploidy));
 
-
-            GLs = subsetAlleles(vc, alleles, false, ploidy);
+            VariantContextBuilder builder = new VariantContextBuilder(vc);
+            builder.alleles(alleles);
+            builder.genotypes(subsetAlleles(vc, alleles, false, ploidy));
+            return builder.make();
+        } else {
+            return vc;
         }
-
-        combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
-
-        return alleles;
     }
 
+    @Override
+    public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { // runs the pool combination, then builds the result
+        combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors); // return value unused; presumably updates calculator state - confirm
+        return getResultFromFinalState(vc, log10AlleleFrequencyPriors); // the result is derived in a separate step after the combination
+    }
 
     /**
      * Simple wrapper class to hold values of combined pool likelihoods.
@@ -94,8 +90,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
 
         public void add(ExactACset set) {
             alleleCountSetList.add(set);
-            conformationMap.put(set.ACcounts, set);
-            final double likelihood = set.log10Likelihoods[0];
+            conformationMap.put(set.getACcounts(), set);
+            final double likelihood = set.getLog10Likelihoods()[0];
 
             if (likelihood > maxLikelihood )
                 maxLikelihood = likelihood;
@@ -108,11 +104,11 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
         }
 
         public double getLikelihoodOfConformation(int[] ac) {
-            return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0];
+            return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0];
         }
 
         public double getGLOfACZero() {
-            return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list
+            return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list
         }
 
         public int getLength() {
@@ -129,6 +125,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
      * @return                            list of numAllelesToChoose most likely alleles
      */
 
+    private static final int PL_INDEX_OF_HOM_REF = 0;
     private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) {
         final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
         final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
@@ -136,7 +133,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
             likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
 
         // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
-        final ArrayList GLs = getGLs(vc.getGenotypes());
+        final ArrayList GLs = getGLs(vc.getGenotypes(), false);
         for ( final double[] likelihoods : GLs ) {
 
             final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
@@ -144,7 +141,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
             // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele
             for (int k=1; k < acCount.length;k++) {
                 if (acCount[k] > 0)
-                    likelihoodSums[k-1].sum += likelihoods[PLindexOfBestGL];
+                    likelihoodSums[k-1].sum += acCount[k] * (likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]);
 
             }
         }
@@ -171,15 +168,13 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
      * @param numAlleles                       Number of alternate alleles
      * @param ploidyPerPool                    Number of samples per pool
      * @param log10AlleleFrequencyPriors       Frequency priors
-     * @param result                           object to fill with output values
      */
-    protected static void combineSinglePools(final GenotypesContext GLs,
-                                             final int numAlleles,
-                                             final int ploidyPerPool,
-                                             final double[] log10AlleleFrequencyPriors,
-                                             final AlleleFrequencyCalculationResult result) {
+    protected void combineSinglePools(final GenotypesContext GLs,
+                                      final int numAlleles,
+                                      final int ploidyPerPool,
+                                      final double[] log10AlleleFrequencyPriors) {
 
-        final ArrayList genotypeLikelihoods = getGLs(GLs);
+        final ArrayList genotypeLikelihoods = getGLs(GLs, true);
 
 
         int combinedPloidy = 0;
@@ -190,23 +185,30 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
         // first element: zero ploidy, e.g. trivial degenerate distribution
         final int[] zeroCounts = new int[numAlleles];
         final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts));
-        set.log10Likelihoods[0] = 0.0;
+        set.getLog10Likelihoods()[0] = 0.0;
 
         combinedPoolLikelihoods.add(set);
-        for (int p=1; p ACqueue = new LinkedList();
         // mapping of ExactACset indexes to the objects
         final HashMap indexesToACset = new HashMap();
@@ -220,19 +222,19 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
         ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts));
 
         ACqueue.add(zeroSet);
-        indexesToACset.put(zeroSet.ACcounts, zeroSet);
+        indexesToACset.put(zeroSet.getACcounts(), zeroSet);
 
         // keep processing while we have AC conformations that need to be calculated
-        double maxLog10L = Double.NEGATIVE_INFINITY;
         while ( !ACqueue.isEmpty() ) {
+            getStateTracker().incNEvaluations();
             // compute log10Likelihoods
             final ExactACset ACset = ACqueue.remove();
-            final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLog10L, ACqueue, indexesToACset);
-            maxLog10L = Math.max(maxLog10L, log10LofKs);
+            final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset);
+
             // clean up memory
-            indexesToACset.remove(ACset.ACcounts);
+            indexesToACset.remove(ACset.getACcounts());
             if ( VERBOSE )
-                System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
+                System.out.printf(" *** removing used set=%s%n", ACset.getACcounts());
 
         }
         return newPool;
@@ -248,51 +250,46 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
      * @param log10AlleleFrequencyPriors Prior object
      * @param originalPloidy             Total ploidy of original combined pool
      * @param newGLPloidy                Ploidy of GL vector
-     * @param result                     AFResult object
-     * @param maxLog10L                  max likelihood observed so far
      * @param ACqueue                    Queue of conformations to compute
      * @param indexesToACset             AC indices of objects in queue
      * @return                           max log likelihood
      */
-    private static double calculateACConformationAndUpdateQueue(final ExactACset set,
-                                                                final CombinedPoolLikelihoods newPool,
-                                                                final CombinedPoolLikelihoods originalPool,
-                                                                final double[] newGL,
-                                                                final double[] log10AlleleFrequencyPriors,
-                                                                final int originalPloidy,
-                                                                final int newGLPloidy,
-                                                                final AlleleFrequencyCalculationResult result,
-                                                                final double  maxLog10L,
-                                                                final LinkedList ACqueue,
-                                                                final HashMap indexesToACset) {
+    private double calculateACConformationAndUpdateQueue(final ExactACset set,
+                                                         final CombinedPoolLikelihoods newPool,
+                                                         final CombinedPoolLikelihoods originalPool,
+                                                         final double[] newGL,
+                                                         final double[] log10AlleleFrequencyPriors,
+                                                         final int originalPloidy,
+                                                         final int newGLPloidy,
+                                                         final LinkedList ACqueue,
+                                                         final HashMap indexesToACset) {
 
         // compute likeihood in "set" of new set based on original likelihoods
-        final int numAlleles = set.ACcounts.counts.length;
+        final int numAlleles = set.getACcounts().getCounts().length;
         final int newPloidy = set.getACsum();
-        final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result);
+        final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy);
 
 
         // add to new pool
         if (!Double.isInfinite(log10LofK))
             newPool.add(set);
 
-        if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
-            if ( VERBOSE )
-                System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
+        // TODO -- change false to true in the abort() call below (the correct behavior) once the implementation of this model is optimized (it's too slow now to handle this fix)
+        if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) {
             return log10LofK;
         }
 
         // iterate over higher frequencies if possible
         // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
         // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space
-        final int ACwiggle = set.ACcounts.counts[0];
+        final int ACwiggle = set.getACcounts().getCounts()[0];
         if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
             return log10LofK;
 
 
         // add conformations for other cases
         for ( int allele = 1; allele < numAlleles; allele++ ) {
-            final int[] ACcountsClone = set.ACcounts.getCounts().clone();
+            final int[] ACcountsClone = set.getACcounts().getCounts().clone();
             ACcountsClone[allele]++;
             // is this a valid conformation?
             int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
@@ -309,67 +306,67 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
     }
 
 
-    /**
-     * Naive combiner of two multiallelic pools - number of alt alleles must be the same.
-     * Math is generalization of biallelic combiner.
-     *
-     * For vector K representing an allele count conformation,
-     * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
-     * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
-     * @param originalPool                    First log-likelihood pool GL vector
-     * @param yy                    Second pool GL vector
-     * @param ploidy1               Ploidy of first pool (# of chromosomes in it)
-     * @param ploidy2               Ploidy of second pool
-     * @param numAlleles            Number of alleles
-     * @param log10AlleleFrequencyPriors Array of biallelic priors
-     * @param result                Af calculation result object                  
-     */
-    public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
-                                                      final double[] log10AlleleFrequencyPriors,
-                                                      final AlleleFrequencyCalculationResult result) {
-/*
-        final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
-        final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
-
-        if (dim1 != originalPool.getLength() || dim2 != yy.length)
-            throw new ReviewedStingException("BUG: Inconsistent vector length");
-
-        if (ploidy2 == 0)
-            return;
-
-        final int newPloidy = ploidy1 + ploidy2;
-
-        // Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
-        // and L2(K) = Pr(D|AC2=K) * choose(m2,K)
-        GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
-        final double[] x = originalPool.getLikelihoodsAsVector(true);
-        while(firstIterator.hasNext()) {
-            x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
-            firstIterator.next();
-        }
-
-        GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
-        final double[] y = yy.clone();
-        while(secondIterator.hasNext()) {
-            y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
-            secondIterator.next();
-        }
-
-        // initialize output to -log10(choose(m1+m2,[k1 k2...])
-        final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
-        final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
-
-
-        // Now, result(K) =  logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
-        while(outputIterator.hasNext()) {
-            final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
-            double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
-
-            originalPool.add(likelihood, set, outputIterator.getLinearIndex());
-            outputIterator.next();
-        }
-*/
-    }
+//    /**
+//     * Naive combiner of two multiallelic pools - number of alt alleles must be the same.
+//     * Math is generalization of biallelic combiner.
+//     *
+//     * For vector K representing an allele count conformation,
+//     * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
+//     * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
+//     * @param originalPool                    First log-likelihood pool GL vector
+//     * @param yy                    Second pool GL vector
+//     * @param ploidy1               Ploidy of first pool (# of chromosomes in it)
+//     * @param ploidy2               Ploidy of second pool
+//     * @param numAlleles            Number of alleles
+//     * @param log10AlleleFrequencyPriors Array of biallelic priors
+//     * @param resultTracker                Af calculation result object
+//     */
+//    public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
+//                                                      final double[] log10AlleleFrequencyPriors,
+//                                                      final AFCalcResultTracker resultTracker) {
+///*
+//        final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
+//        final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
+//
+//        if (dim1 != originalPool.getLength() || dim2 != yy.length)
+//            throw new ReviewedStingException("BUG: Inconsistent vector length");
+//
+//        if (ploidy2 == 0)
+//            return;
+//
+//        final int newPloidy = ploidy1 + ploidy2;
+//
+//        // Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
+//        // and L2(K) = Pr(D|AC2=K) * choose(m2,K)
+//        GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
+//        final double[] x = originalPool.getLikelihoodsAsVector(true);
+//        while(firstIterator.hasNext()) {
+//            x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
+//            firstIterator.next();
+//        }
+//
+//        GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
+//        final double[] y = yy.clone();
+//        while(secondIterator.hasNext()) {
+//            y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
+//            secondIterator.next();
+//        }
+//
+//        // initialize output to -log10(choose(m1+m2,[k1 k2...])
+//        final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
+//        final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
+//
+//
+//        // Now, result(K) =  logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
+//        while(outputIterator.hasNext()) {
+//            final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
+//            double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
+//
+//            originalPool.add(likelihood, set, outputIterator.getLinearIndex());
+//            outputIterator.next();
+//        }
+//*/
+//    }
 
     /**
      * Compute likelihood of a particular AC conformation and update AFresult object
@@ -380,15 +377,13 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
      * @param numAlleles                Number of alleles (including ref)
      * @param ploidy1                   Ploidy of original pool (combined)
      * @param ploidy2                   Ploidy of new pool
-     * @param result                    AFResult object
      * @return                          log-likehood of requested conformation
      */
-    private static double computeLofK(final ExactACset set,
-                                      final CombinedPoolLikelihoods firstGLs,
-                                      final double[] secondGL,
-                                      final double[] log10AlleleFrequencyPriors,
-                                      final int numAlleles, final int ploidy1, final int ploidy2,
-                                      final AlleleFrequencyCalculationResult result) {
+    private double computeLofK(final ExactACset set,
+                               final CombinedPoolLikelihoods firstGLs,
+                               final double[] secondGL,
+                               final double[] log10AlleleFrequencyPriors,
+                               final int numAlleles, final int ploidy1, final int ploidy2) {
 
         final int newPloidy = ploidy1 + ploidy2;
 
@@ -397,17 +392,18 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
         if (newPloidy != totalAltK)
             throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values");
 
-        totalAltK -= set.ACcounts.counts[0];
+        totalAltK -= set.getACcounts().getCounts()[0];
         // totalAltK has sum of alt alleles of conformation now
 
 
         // special case for k = 0 over all k
         if ( totalAltK == 0 ) {   // all-ref case
             final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
-            set.log10Likelihoods[0] = log10Lof0;
+            set.getLog10Likelihoods()[0] = log10Lof0;
 
-            result.setLog10LikelihoodOfAFzero(log10Lof0);
-            result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
+            getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0);
+            getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
+            return log10Lof0;
 
         }   else {
 
@@ -415,12 +411,12 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
             // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy.
             // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i
 
-            int[] currentCount = set.ACcounts.getCounts();
+            int[] currentCount = set.getACcounts().getCounts();
             double denom =  -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);
 
             // for current conformation, get all possible ways to break vector K into two components G1 and G2
             final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
-            set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY;
+            set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY;
             while (innerIterator.hasNext()) {
                 // check if breaking current conformation into g1 and g2 is feasible.
                 final int[] acCount2 = innerIterator.getCurrentVector();
@@ -436,27 +432,29 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
                         final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2);
                         final double sum = firstGL + gl2 + num1 + num2;
 
-                        set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum);
+                        set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum);
                     }
                 }
                 innerIterator.next();
             }
 
-            set.log10Likelihoods[0] += denom;
+            set.getLog10Likelihoods()[0] += denom;
         }
 
-        double log10LofK = set.log10Likelihoods[0];
+        double log10LofK = set.getLog10Likelihoods()[0];
 
         // update the MLE if necessary
-        final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length);
-        result.updateMLEifNeeded(log10LofK, altCounts);
+        final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length);
+        // TODO -- GUILLERMO: this code may produce positive likelihoods or -Infinity; Math.max below clamps -Infinity to -Double.MAX_VALUE before the MLE update
+        getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
 
         // apply the priors over each alternate allele
         for (final int ACcount : altCounts ) {
             if ( ACcount > 0 )
                 log10LofK += log10AlleleFrequencyPriors[ACcount];
         }
-        result.updateMAPifNeeded(log10LofK, altCounts);
+        // TODO -- GUILLERMO: this code may produce positive likelihoods or -Infinity; Math.max below clamps -Infinity to -Double.MAX_VALUE before the MAP update
+        getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
 
         return log10LofK;
     }
@@ -479,99 +477,6 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
         return (sum == ploidy);
     }
 
-    /**
-     * Combines naively two biallelic pools (of arbitrary size).
-     * For two pools of size m1 and m2, we can compute the combined likelihood as:
-     *   Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k)
-     * @param originalPool              Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i
-     * @param newPLVector               Second GL vector
-     * @param ploidy1               Ploidy of first pool (# of chromosomes in it)
-     * @param ploidy2               Ploidy of second pool
-     * @param log10AlleleFrequencyPriors Array of biallelic priors
-     * @param result                Af calculation result object
-     * @return                Combined likelihood vector
-     */
-    public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
-                                                                 final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
-                                                                 final AlleleFrequencyCalculationResult result) {
-
-        final int newPloidy = ploidy1 + ploidy2;
-
-        final double[] combinedLikelihoods = new double[1+newPloidy];
-
-        /** Pre-fill result array and incorporate weights into input vectors
-         * Say L1(k) = Pr(D|AC1=k) * choose(m1,k)
-         * and L2(k) = Pr(D|AC2=k) * choose(m2,k)
-         * equation reduces to
-         * Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j)
-         * which is just plain convolution of L1 and L2 (with pre-existing vector)
-         */
-
-        // intialize result vector to -infinity
-        Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY);
-
-        final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1);
-        for (int k=originalPool.getProbabilityVector().length; k< x.length; k++)
-            x[k] = Double.NEGATIVE_INFINITY;
-
-        final double[] y = newPLVector.clone();
-
-
-        final double log10Lof0 = x[0]+y[0];
-        result.setLog10LikelihoodOfAFzero(log10Lof0);
-        result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
-
-        double maxElement = log10Lof0;
-        int maxElementIdx = 0;
-        int[] alleleCounts = new int[1];
-        for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) {
-            double[] acc = new double[k+1];
-            Arrays.fill(acc,Double.NEGATIVE_INFINITY);
-            double innerMax = Double.NEGATIVE_INFINITY;
-
-            for (int j=0; j <=k; j++) {
-                double x1,y1;
-
-
-                if (k-j>=0 && k-j < y.length)
-                    y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j);
-                else
-                    continue;
-
-                if (j < x.length)
-                    x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j);
-                else
-                    continue;
-
-                if (Double.isInfinite(x1) || Double.isInfinite(y1))
-                    continue;
-                acc[j] = x1 + y1;
-                if (acc[j] > innerMax)
-                    innerMax = acc[j];
-                else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY)
-                    break;
-            }
-            combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k);
-            maxElementIdx = k;
-            double maxDiff = combinedLikelihoods[k] - maxElement;
-            if (maxDiff > 0)
-                maxElement = combinedLikelihoods[k];
-            else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) {
-                break;
-            }
-
-            alleleCounts[0] = k;
-            result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
-            result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
-
-
-        }
-
-
-        return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true));
-    }
-
-
     /**
      * From a given variant context, extract a given subset of alleles, and update genotype context accordingly,
      * including updating the PL's, and assign genotypes accordingly
@@ -614,7 +519,10 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
             // create the new likelihoods array from the alleles we are allowed to use
             final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
             double[] newLikelihoods;
-            if ( numOriginalAltAlleles == numNewAltAlleles) {
+
+            // Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization
+            // and subsetting
+            if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) {
                 newLikelihoods = originalLikelihoods;
             } else {
                 newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse);
@@ -657,10 +565,10 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
      *
      * @return genotype
      */
-    private static void assignGenotype(final GenotypeBuilder gb,
-                                       final double[] newLikelihoods,
-                                       final List allelesToUse,
-                                       final int numChromosomes) {
+    private void assignGenotype(final GenotypeBuilder gb,
+                                final double[] newLikelihoods,
+                                final List allelesToUse,
+                                final int numChromosomes) {
         final int numNewAltAlleles = allelesToUse.size() - 1;
 
 
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java
index 0890ac20c..287acafb3 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnEdge.java
@@ -2,6 +2,9 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
 
 import org.jgrapht.graph.DefaultDirectedGraph;
 
+import java.io.Serializable;
+import java.util.Comparator;
+
 /**
  * Created by IntelliJ IDEA.
  * User: ebanks
@@ -9,7 +12,7 @@ import org.jgrapht.graph.DefaultDirectedGraph;
  */
 
 // simple edge class for connecting nodes in the graph
-public class DeBruijnEdge implements Comparable {
+public class DeBruijnEdge {
 
     private int multiplicity;
     private boolean isRef;
@@ -53,8 +56,10 @@ public class DeBruijnEdge implements Comparable {
         return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge)));
     }
 
-    @Override
-    public int compareTo( final DeBruijnEdge that ) {
-        return this.multiplicity - that.multiplicity;
+    public static class EdgeWeightComparator implements Comparator<DeBruijnEdge>, Serializable {
+        @Override
+        public int compare(final DeBruijnEdge edge1, final DeBruijnEdge edge2) {
+            return edge1.multiplicity - edge2.multiplicity;
+        }
     }
 }
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java
index 39833613d..4da3251bc 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/DeBruijnVertex.java
@@ -14,7 +14,7 @@ public class DeBruijnVertex {
     public final int kmer;
 
     public DeBruijnVertex( final byte[] sequence, final int kmer ) {
-        this.sequence = sequence;
+        this.sequence = sequence.clone();
         this.kmer = kmer;
     }
 
@@ -37,7 +37,7 @@ public class DeBruijnVertex {
     }
 
     public byte[] getSequence() {
-        return sequence;
+        return sequence.clone(); // hand back a copy so callers cannot modify the vertex's internal array
     }
 
     public byte[] getSuffix() {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
index 9de9b3292..d91df82e2 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java
@@ -52,7 +52,11 @@ public class GenotypingEngine {
         noCall.add(Allele.NO_CALL);
     }
 
-    // This function is the streamlined approach, currently not being used
+    // WARN
+    // This function is the streamlined approach, currently not being used by default
+    // WARN
+    // WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code.
+    // WARN
     @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
     public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
                                                                                                                              final ArrayList haplotypes,
@@ -184,6 +188,7 @@ public class GenotypingEngine {
         return returnCalls;
     }
 
+    // BUGBUG: Create a class to hold this complicated return type
     @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
     public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
                                                                                                                                final ArrayList haplotypes,
@@ -210,14 +215,9 @@ public class GenotypingEngine {
                 System.out.println( ">> Events = " + h.getEventMap());
             }
         }
-        // Create the VC merge priority list
-        final ArrayList priorityList = new ArrayList();
-        for( int iii = 0; iii < haplotypes.size(); iii++ ) {
-            priorityList.add("HC" + iii);
-        }
 
-        cleanUpSymbolicUnassembledEvents( haplotypes, priorityList );
-        if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure
+        cleanUpSymbolicUnassembledEvents( haplotypes );
+        if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
             mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
         }
         if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode!
@@ -229,13 +229,16 @@ public class GenotypingEngine {
         // Walk along each position in the key set and create each event to be outputted
         for( final int loc : startPosKeySet ) {
             if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
-                final ArrayList eventsAtThisLoc = new ArrayList();
+                final ArrayList eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view
+                final ArrayList priorityList = new ArrayList(); // used to merge overlapping events into common reference view
+
                 if( activeAllelesToGenotype.isEmpty() ) {
                     for( final Haplotype h : haplotypes ) {
                         final HashMap eventMap = h.getEventMap();
                         final VariantContext vc = eventMap.get(loc);
                         if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
                             eventsAtThisLoc.add(vc);
+                            priorityList.add(vc.getSource());
                         }
                     }
                 } else { // we are in GGA mode!
@@ -260,11 +263,27 @@ public class GenotypingEngine {
                 // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
                 final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
 
+                // Sanity check the priority list
+                for( final VariantContext vc : eventsAtThisLoc ) {
+                    if( !priorityList.contains(vc.getSource()) ) {
+                        throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles.");
+                    }
+                }
+                for( final String name : priorityList ) {
+                    boolean found = false;
+                    for( final VariantContext vc : eventsAtThisLoc ) {
+                        if(vc.getSource().equals(name)) { found = true; break; }
+                    }
+                    if( !found ) {
+                        throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. Something went wrong in the merging of alleles.");
+                    }
+                }
+
                 // Merge the event to find a common reference representation
                 final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
                 if( mergedVC == null ) { continue; }
 
-                final HashMap> alleleHashMap = new HashMap>();
+                HashMap> alleleHashMap = new HashMap>();
                 int aCount = 0;
                 for( final Allele a : mergedVC.getAlleles() ) {
                     alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
@@ -289,9 +308,20 @@ public class GenotypingEngine {
                     }
                     genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
                 }
-                final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
-
+                VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
                 if( call != null ) {
+                    if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
+                        final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
+                        // also, need to update the allele -> haplotype mapping
+                        final HashMap> alleleHashMapTrim = new HashMap>();
+                        for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
+                            alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii)));
+                        }
+
+                        call = vcCallTrim;
+                        alleleHashMap = alleleHashMapTrim;
+                    }
+
                     returnCalls.add( new Pair>>(call, alleleHashMap) );
                 }
             }
@@ -299,9 +329,8 @@ public class GenotypingEngine {
         return returnCalls;
     }
 
-    protected static void cleanUpSymbolicUnassembledEvents( final ArrayList haplotypes, final ArrayList priorityList ) {
+    protected static void cleanUpSymbolicUnassembledEvents( final ArrayList haplotypes ) {
         final ArrayList haplotypesToRemove = new ArrayList();
-        final ArrayList stringsToRemove = new ArrayList();
         for( final Haplotype h : haplotypes ) {
             for( final VariantContext vc : h.getEventMap().values() ) {
                 if( vc.isSymbolic() ) {
@@ -309,7 +338,6 @@ public class GenotypingEngine {
                         for( final VariantContext vc2 : h2.getEventMap().values() ) {
                             if( vc.getStart() == vc2.getStart() && vc2.isIndel() ) {
                                 haplotypesToRemove.add(h);
-                                stringsToRemove.add(vc.getSource());
                                 break;
                             }
                         }
@@ -318,7 +346,6 @@ public class GenotypingEngine {
             }
         }
         haplotypes.removeAll(haplotypesToRemove);
-        priorityList.removeAll(stringsToRemove);
     }
 
     protected void mergeConsecutiveEventsBasedOnLD( final ArrayList haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
index 8079c2e1f..5aba23faa 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java
@@ -27,29 +27,23 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
 
 import com.google.java.contract.Ensures;
 import net.sf.picard.reference.IndexedFastaSequenceFile;
-import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
-import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
-import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
 import org.broadinstitute.sting.commandline.*;
 import org.broadinstitute.sting.gatk.CommandLineGATK;
 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
+import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
 import org.broadinstitute.sting.gatk.filters.BadMateFilter;
+import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
 import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
-import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
-import org.broadinstitute.sting.gatk.walkers.PartitionBy;
-import org.broadinstitute.sting.gatk.walkers.PartitionType;
+import org.broadinstitute.sting.gatk.walkers.*;
 import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
 import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
-import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
-import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
-import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
-import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
+import org.broadinstitute.sting.gatk.walkers.genotyper.*;
 import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
 import org.broadinstitute.sting.utils.clipping.ReadClipper;
 import org.broadinstitute.sting.utils.codecs.vcf.*;
 import org.broadinstitute.sting.utils.collections.Pair;
@@ -57,6 +51,9 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
 import org.broadinstitute.sting.utils.fragments.FragmentCollection;
 import org.broadinstitute.sting.utils.fragments.FragmentUtils;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
+import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
+import org.broadinstitute.sting.utils.pairhmm.PairHMM;
 import org.broadinstitute.sting.utils.pileup.PileupElement;
 import org.broadinstitute.sting.utils.sam.AlignmentUtils;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@@ -106,6 +103,7 @@ import java.util.*;
 
 @DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} )
 @PartitionBy(PartitionType.LOCUS)
+@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
 @ActiveRegionExtension(extension=65, maxRegion=300)
 public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible {
 
@@ -118,6 +116,12 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
     @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false)
     protected PrintStream graphWriter = null;
 
+    /**
+     * The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
+     */
+    @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
+    public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
+
     @Hidden
     @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
     protected String keepRG = null;
@@ -177,7 +181,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
      * so annotations will be excluded even if they are explicitly included with the other options.
      */
     @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
-    protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"HaplotypeScore", "MappingQualityZero", "SpanningDeletions", "TandemRepeatAnnotator"}));
+    protected List annotationsToExclude = new ArrayList(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"}));
 
     /**
      * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
@@ -238,12 +242,16 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
         samplesList.addAll( samples );
         // initialize the UnifiedGenotyper Engine which is used to call into the exact model
         final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
-        UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
-        UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
-        UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
-        UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING);
-        UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
-        UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
+        UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
+
+        // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
+        UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC);
+        simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
+        simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
+        simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
+        simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
+        simpleUAC.exactCallsLog = null;
+        UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
 
         // initialize the output VCF header
         annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
@@ -287,7 +295,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
         }
 
         assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
-        likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false );
+        likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
         genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
     }
 
@@ -312,7 +320,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
         if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
             for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) {
                 if( !allelesToGenotype.contains(vc) ) {
-                    allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a ReadMetaDataTracker object
+                    allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object
                 }
             }
             if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
@@ -400,6 +408,9 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
         final List filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
         if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
 
+        // sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
+        Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
+
         // evaluate each sample's reads against all haplotypes
         final HashMap> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
         final HashMap> perSampleFilteredReadList = splitReadsBySample( filteredReads );
@@ -414,7 +425,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem
                   : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
             if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
 
-            final Map>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult );
+            final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
             final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
             final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes());
 
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java
index 0ef1a13a4..f7575439b 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/KBestPaths.java
@@ -4,6 +4,7 @@ import org.apache.commons.lang.ArrayUtils;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.jgrapht.graph.DefaultDirectedGraph;
 
+import java.io.Serializable;
 import java.util.*;
 
 /**
@@ -76,13 +77,15 @@ public class KBestPaths {
         }
     }
 
-    protected static class PathComparatorTotalScore implements Comparator {
+    protected static class PathComparatorTotalScore implements Comparator, Serializable {
+        @Override
         public int compare(final Path path1, final Path path2) {
             return path1.totalScore - path2.totalScore;
         }
     }
 
-    //protected static class PathComparatorLowestEdge implements Comparator {
+    //protected static class PathComparatorLowestEdge implements Comparator, Serializable {
+    //    @Override
     //    public int compare(final Path path1, final Path path2) {
     //        return path2.lowestEdge - path1.lowestEdge;
     //    }
@@ -124,7 +127,7 @@ public class KBestPaths {
             // recursively run DFS
             final ArrayList edgeArrayList = new ArrayList();
             edgeArrayList.addAll(graph.outgoingEdgesOf(path.lastVertex));
-            Collections.sort(edgeArrayList);
+            Collections.sort(edgeArrayList, new DeBruijnEdge.EdgeWeightComparator());
             Collections.reverse(edgeArrayList);
             for ( final DeBruijnEdge edge : edgeArrayList ) {
                 // make sure the edge is not already in the path
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
index b5ce4b4bc..a0924623b 100644
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java
@@ -27,25 +27,46 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
 
 import com.google.java.contract.Ensures;
 import com.google.java.contract.Requires;
+import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
 import org.broadinstitute.sting.utils.*;
 import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.pairhmm.*;
 import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 import org.broadinstitute.sting.utils.sam.ReadUtils;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 
+import java.io.PrintStream;
 import java.util.*;
 
 public class LikelihoodCalculationEngine {
 
     private static final double LOG_ONE_HALF = -Math.log10(2.0);
-    private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1;
     private final byte constantGCP;
     private final boolean DEBUG;
     private final PairHMM pairHMM;
 
-    public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) {
-        pairHMM = new PairHMM( noBanded );
+    public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
+
+        switch (hmmType) {
+            case EXACT:
+                pairHMM = new ExactPairHMM();
+                break;
+            case ORIGINAL:
+                pairHMM = new OriginalPairHMM();
+                break;
+            case CACHING:
+                pairHMM = new CachingPairHMM();
+                break;
+            case LOGLESS_CACHING:
+                pairHMM = new LoglessCachingPairHMM();
+                break;
+            default:
+                throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
+        }
+
         this.constantGCP = constantGCP;
         DEBUG = debug;
     }
@@ -69,23 +90,18 @@ public class LikelihoodCalculationEngine {
         X_METRIC_LENGTH += 2;
         Y_METRIC_LENGTH += 2;
 
-        // initial arrays to hold the probabilities of being in the match, insertion and deletion cases
-        final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
-        final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
-        final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
-
-        PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
+        // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
+        pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
 
         // for each sample's reads
-        for( final String sample : perSampleReadList.keySet() ) {
+        for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) {
             //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
             // evaluate the likelihood of the reads given those haplotypes
-            computeReadLikelihoods( haplotypes, perSampleReadList.get(sample), sample, matchMetricArray, XMetricArray, YMetricArray );
+            computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() );
         }
     }
 
-    private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample,
-                                         final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
+    private void computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads, final String sample ) {
 
         final int numHaplotypes = haplotypes.size();
         final int numReads = reads.size();
@@ -113,9 +129,8 @@ public class LikelihoodCalculationEngine {
                 final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
                 previousHaplotypeSeen = haplotype;
 
-                readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(),
-                        readQuals, readInsQuals, readDelQuals, overallGCP,
-                        haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
+                readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
+                        readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0);
                 readCounts[jjj][iii] = readCount;
             }
         }
@@ -125,12 +140,12 @@ public class LikelihoodCalculationEngine {
     }
 
     private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) {
-        for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){
+        for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ) { // scan only the prefix shared by both arrays
             if( b1[iii] != b2[iii] ) {
                 return iii;
             }
         }
-        return b1.length;
+        return Math.min(b1.length, b2.length); // no mismatch in the shared prefix: report the shorter array's length
     }
 
     @Requires({"haplotypes.size() > 0"})
@@ -183,7 +198,7 @@ public class LikelihoodCalculationEngine {
                                 haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
                             }
                         }
-                        haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
+                        haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
                     }
                 }       
             }
@@ -280,7 +295,7 @@ public class LikelihoodCalculationEngine {
         final int numHaplotypes = haplotypes.size();
         final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
         final ArrayList bestHaplotypesIndexList = new ArrayList();
-        bestHaplotypesIndexList.add(0); // always start with the reference haplotype
+        bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
         // set up the default 1-to-1 haplotype mapping object
         final ArrayList> haplotypeMapping = new ArrayList>();
         for( final Haplotype h : haplotypes ) {
@@ -322,19 +337,30 @@ public class LikelihoodCalculationEngine {
         return bestHaplotypes;
     }
 
-    public static Map>> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, final Pair>> call) {
-        final Map>> returnMap = new HashMap>>();
+    public static int findReferenceIndex( final List<Haplotype> haplotypes ) {
+        for( int iii = 0; iii < haplotypes.size(); iii++ ) { // index loop: indexOf() would rescan the list and match by equals() rather than by position
+            if( haplotypes.get(iii).isReference() ) { return iii; } // index of the first haplotype flagged as reference
+        }
+        throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
+    }
+
+    public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
+                                                                                            final HashMap> perSampleReadList,
+                                                                                            final HashMap> perSampleFilteredReadList,
+                                                                                            final Pair>> call,
+                                                                                            final double downsamplingFraction,
+                                                                                            final PrintStream downsamplingLog ) {
+        final Map returnMap = new HashMap();
         final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
         for( final Map.Entry> sample : perSampleReadList.entrySet() ) {
-            final Map> alleleReadMap = new HashMap>();
+            final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
+
             final ArrayList readsForThisSample = sample.getValue();
             for( int iii = 0; iii < readsForThisSample.size(); iii++ ) {
                 final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same!
                 // only count the read if it overlaps the event, otherwise it is not added to the output read list at all
                 if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
-                    final double likelihoods[] = new double[call.getFirst().getAlleles().size()];
-                    int count = 0;
-                    for( final Allele a : call.getFirst().getAlleles() ) { // find the allele with the highest haplotype likelihood
+                    for( final Allele a : call.getFirst().getAlleles() ) {
                         double maxLikelihood = Double.NEGATIVE_INFINITY;
                         for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object)
                             final double likelihood = h.getReadLikelihoods(sample.getKey())[iii];
@@ -342,43 +368,26 @@ public class LikelihoodCalculationEngine {
                                 maxLikelihood = likelihood;
                             }
                         }
-                        likelihoods[count++] = maxLikelihood;
+                        likelihoodMap.add(read, a, maxLikelihood);
                     }
-                    final int bestAllele = MathUtils.maxElementIndex(likelihoods);
-                    final double bestLikelihood = likelihoods[bestAllele];
-                    Allele allele = Allele.NO_CALL;
-                    boolean isInformativeRead = false;
-                    for( final double likelihood : likelihoods ) {
-                        if( bestLikelihood - likelihood > BEST_LIKELIHOOD_THRESHOLD ) {
-                            isInformativeRead = true;
-                            break;
-                        }
-                    }
-                    // uninformative reads get the no call Allele
-                    if( isInformativeRead ) {
-                        allele = call.getFirst().getAlleles().get(bestAllele);
-                    }
-                    List readList = alleleReadMap.get(allele);
-                    if( readList == null ) {
-                        readList = new ArrayList();
-                        alleleReadMap.put(allele, readList);
-                    }
-                    readList.add(read);
                 }
             }
+
+            // down-sample before adding filtered reads
+            likelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
+
             // add all filtered reads to the NO_CALL list because they weren't given any likelihoods
-            List readList = alleleReadMap.get(Allele.NO_CALL);
-            if( readList == null ) {
-                readList = new ArrayList();
-                alleleReadMap.put(Allele.NO_CALL, readList);
-            }
             for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
                 // only count the read if it overlaps the event, otherwise it is not added to the output read list at all
                 if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
-                    readList.add(read);
+                    for( final Allele a : call.getFirst().getAlleles() ) {
+                        likelihoodMap.add(read, a, 0.0);
+                    }
                 }
             }
-            returnMap.put(sample.getKey(), alleleReadMap);
+
+            returnMap.put(sample.getKey(), likelihoodMap);
+
         }
         return returnMap;
     }
diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
index 93fd36a22..fd46e4e69 100755
--- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
+++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java
@@ -184,7 +184,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
         for( final GATKSAMRecord read : reads ) {
             final byte[] sequence = read.getReadBases();
             final byte[] qualities = read.getBaseQualities();
-            final byte[] reducedReadCounts = read.getReducedReadCounts();  // will be null if read is not readuced
+            final byte[] reducedReadCounts = read.getReducedReadCounts();  // will be null if read is not reduced
             if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) {
                 final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
                 for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {                    
@@ -201,7 +201,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
                         // compute mean number of reduced read counts in current kmer span
                         final byte[] counts = Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1);
                         // precise rounding can make a difference with low consensus counts
-                        countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length);
+                        countNumber = MathUtils.arrayMax(counts);
+                      //  countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length);
                     }
 
                     if( !badKmer ) {
diff --git a/protected/java/src/org/broadinstitute/sting/utils/genotyper/AdvancedPerReadAlleleLikelihoodMap.java b/protected/java/src/org/broadinstitute/sting/utils/genotyper/AdvancedPerReadAlleleLikelihoodMap.java
new file mode 100644
index 000000000..77a7c3bd9
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/utils/genotyper/AdvancedPerReadAlleleLikelihoodMap.java
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2011 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+package org.broadinstitute.sting.utils.genotyper;
+
+
+import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
+import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+
+import java.io.PrintStream;
+import java.util.*;
+
+public class AdvancedPerReadAlleleLikelihoodMap extends StandardPerReadAlleleLikelihoodMap implements ProtectedPackageSource {
+    /** Returns the allele-biased down-sampled base pileup; pure delegation to AlleleBiasedDownsamplingUtils. */
+    public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
+        return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log);
+    }
+    /** Removes reads from likelihoodReadMap, stratified by each read's most likely allele; reduced reads and NO_CALL reads are kept. */
+    public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {
+        // special cases: a fraction <= 0 removes nothing, a fraction >= 1 removes every read
+        if ( downsamplingFraction <= 0.0 )
+            return;
+        if ( downsamplingFraction >= 1.0 ) {
+            likelihoodReadMap.clear();
+            return;
+        }
+
+        // start by stratifying the reads by the alleles they represent at this position
+        final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>(alleles.size());
+        for ( Allele allele : alleles )
+            alleleReadMap.put(allele, new ArrayList<GATKSAMRecord>());
+
+        for ( Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet() ) {
+            // do not remove reduced reads!
+            if ( !entry.getKey().isReducedRead() ) {
+                final Allele bestAllele = getMostLikelyAllele(entry.getValue());
+                if ( bestAllele != Allele.NO_CALL )
+                    alleleReadMap.get(bestAllele).add(entry.getKey());
+            }
+        }
+
+        // select the reads to drop first, then remove them once the iteration over the map has finished
+        final List<GATKSAMRecord> readsToRemove = AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(alleleReadMap, downsamplingFraction, log);
+        for ( final GATKSAMRecord read : readsToRemove )
+            likelihoodReadMap.remove(read);
+    }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java
new file mode 100644
index 000000000..282db45d5
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/CachingPairHMM.java
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.pairhmm;
+
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.QualityUtils;
+
+import java.util.Arrays;
+
+/**
+ * PairHMM variant that keeps two caches: constantMatrix, rebuilt only when recacheReadValues
+ * is set, and distanceMatrix, refreshed from the given haplotype start index onwards.
+ * User: rpoplin, carneiro; Date: 10/16/12
+ */
+
+public class CachingPairHMM extends OriginalPairHMM {
+
+    double[][] constantMatrix = null; // one row per read position: the six transition constants (see initializeConstants)
+    double[][] distanceMatrix = null; // one cell per read x haplotype position: match/mismatch prior (see initializeDistanceMatrix)
+    // constants for row 1 (the row before the first read base), derived from DEFAULT_GOP / DEFAULT_GCP
+    protected static final double [] firstRowConstantMatrix = {
+            QualityUtils.qualToProbLog10((byte) (DEFAULT_GOP + DEFAULT_GOP)),
+            QualityUtils.qualToProbLog10(DEFAULT_GCP),
+            QualityUtils.qualToErrorProbLog10(DEFAULT_GOP),
+            QualityUtils.qualToErrorProbLog10(DEFAULT_GCP),
+            0.0,
+            0.0
+    };
+    /** Allocates both caches for the given maximum lengths and pre-computes row 1 of the state arrays. */
+    @Override
+    public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
+
+        super.initialize(READ_MAX_LENGTH, HAPLOTYPE_MAX_LENGTH);
+
+        // the matrices are sized read / haplotype length + 2: + 1 for the initial-condition row and column, and + 1 to consider the final base in a non-global alignment
+        final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
+        final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
+
+        constantMatrix = new double[X_METRIC_LENGTH][6];
+        distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
+
+        // fill in row 1 (columns 2 and up) with the default-penalty constants and a prior of 0.0
+        for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
+            updateCell(1, jjj, 0.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
+        }
+    }
+    /** Returns the log10 likelihood of the read given the haplotype, recomputing columns from hapStartIndex + 1 onwards. */
+    @Override
+    public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
+                                                            final byte[] readBases,
+                                                            final byte[] readQuals,
+                                                            final byte[] insertionGOP,
+                                                            final byte[] deletionGOP,
+                                                            final byte[] overallGCP,
+                                                            final int hapStartIndex,
+                                                            final boolean recacheReadValues ) {
+
+        if( recacheReadValues ) {
+            initializeConstants( insertionGOP, deletionGOP, overallGCP );
+        }
+        initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
+
+        // only the part of the matrices covering this read and haplotype is used: actual length + 2 in each dimension (see initialize)
+        final int X_METRIC_LENGTH = readBases.length + 2;
+        final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
+
+        for (int i = 2; i < X_METRIC_LENGTH; i++) {
+            for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
+                updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
+            }
+        }
+
+        // final probability is the log10 sum of the last element in all three state arrays
+        final int endI = X_METRIC_LENGTH - 1;
+        final int endJ = Y_METRIC_LENGTH - 1;
+        return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
+    }
+
+    /**
+     * Fills the distanceMatrix with the log10 probability of each read base given each
+     * haplotype base (match or 'N': qualToProbLog10, mismatch: qualToErrorProbLog10).
+     *
+     * @param haplotypeBases the bases of the haplotype
+     * @param readBases      the bases of the read
+     * @param readQuals      the base quality scores of the read
+     * @param startIndex     first haplotype index to update in the distanceMatrix; earlier columns keep their previous values
+     */
+    public void initializeDistanceMatrix( final byte[] haplotypeBases,
+                                          final byte[] readBases,
+                                          final byte[] readQuals,
+                                          final int startIndex ) {
+
+        // fill the log10 base-probability matrix for all combinations of read x haplotype bases (both indices offset by 2)
+        // Relies on java initializing arrays with 0.0, so there is no need to fill in rows and columns below 2.
+
+        for (int i = 0; i < readBases.length; i++) {
+            final byte x = readBases[i];
+            final byte qual = readQuals[i];
+            for (int j = startIndex; j < haplotypeBases.length; j++) {
+                final byte y = haplotypeBases[j];
+                distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
+                        QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
+            }
+        }
+    }
+
+    /**
+     * Fills the constantMatrix with the log10 transition constants derived from the read's quality scores.
+     *
+     * @param insertionGOP   insertion quality scores of the read
+     * @param deletionGOP    deletion quality scores of the read
+     * @param overallGCP     overall gap continuation penalty
+     */
+    public void initializeConstants( final byte[] insertionGOP,
+                                     final byte[] deletionGOP,
+                                     final byte[] overallGCP ) {
+
+        final int l = insertionGOP.length;
+        constantMatrix[1] = firstRowConstantMatrix; // row 1 aliases the shared static array, it is not a copy
+        for (int i = 0; i < l; i++) {
+            final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
+            constantMatrix[i+2][0] = QualityUtils.qualToProbLog10((byte) qualIndexGOP);
+            constantMatrix[i+2][1] = QualityUtils.qualToProbLog10(overallGCP[i]);
+            constantMatrix[i+2][2] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]);
+            constantMatrix[i+2][3] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
+            constantMatrix[i+2][4] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]);
+            constantMatrix[i+2][5] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
+        }
+        constantMatrix[l+1][4] = 0.0; // row of the last read base: the two constants feeding the Y array are reset to 0.0
+        constantMatrix[l+1][5] = 0.0;
+    }
+
+    /**
+     * Updates a cell in the HMM matrix
+     *
+     * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
+     * initial conditions
+     *
+     * @param indI             row index in the matrices to update
+     * @param indJ             column index in the matrices to update
+     * @param prior            the log10 match/mismatch prior for this cell (a distanceMatrix entry, or 0.0 for row 1)
+     * @param constants        the six constants for this row: [0] M->M, [1] X->M and Y->M, [2] M->X, [3] X->X, [4] M->Y, [5] Y->Y
+     * @param matchMetricArray the matches likelihood matrix
+     * @param XMetricArray     the insertions likelihood matrix
+     * @param YMetricArray     the deletions likelihood matrix
+     */
+    private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
+                             final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
+
+        matchMetricArray[indI][indJ] = prior +
+                MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ - 1] + constants[0],
+                        XMetricArray[indI - 1][indJ - 1] + constants[1],
+                        YMetricArray[indI - 1][indJ - 1] + constants[1] );
+        XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ] + constants[2],
+                XMetricArray[indI - 1][indJ] + constants[3]);
+        YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI][indJ - 1] + constants[4],
+                YMetricArray[indI][indJ - 1] + constants[5]);
+    }
+}
diff --git a/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java
new file mode 100644
index 000000000..d2aef5bb5
--- /dev/null
+++ b/protected/java/src/org/broadinstitute/sting/utils/pairhmm/LoglessCachingPairHMM.java
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.utils.pairhmm;
+
+import org.broadinstitute.sting.utils.QualityUtils;
+
+import java.util.Arrays;
+
+/**
+ * Variant of CachingPairHMM that works on plain (non-log) probabilities: the initial cell is
+ * scaled by 10^SCALE_FACTOR_LOG10 and only the final result is converted back to log10.
+ * User: rpoplin, carneiro; Date: 10/16/12
+ */
+
+public class LoglessCachingPairHMM extends CachingPairHMM {
+    // log10 of the factor applied to the initial cell; subtracted again from the final log10 result
+    protected static final double SCALE_FACTOR_LOG10 = 300.0;
+    // hides CachingPairHMM.firstRowConstantMatrix with the plain-probability equivalents
+    protected static final double [] firstRowConstantMatrix = {
+            QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)),
+            QualityUtils.qualToProb(DEFAULT_GCP),
+            QualityUtils.qualToErrorProb(DEFAULT_GOP),
+            QualityUtils.qualToErrorProb(DEFAULT_GCP),
+            1.0,
+            1.0
+    };
+    /** Allocates the state arrays and both caches itself (super.initialize is not called) and pre-computes row 1. */
+    @Override
+    public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
+
+        // the matrices are sized read / haplotype length + 2: + 1 for the initial-condition row and column, and + 1 to consider the final base in a non-global alignment
+        final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
+        final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;
+
+        matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
+        XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
+        YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
+
+        for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
+            Arrays.fill(matchMetricArray[iii], 0.0);
+            Arrays.fill(XMetricArray[iii], 0.0);
+            Arrays.fill(YMetricArray[iii], 0.0);
+        }
+
+        // the initial condition
+        matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10); // a probability of 1.0, scaled up by 10^SCALE_FACTOR_LOG10
+
+        constantMatrix = new double[X_METRIC_LENGTH][6];
+        distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
+
+        // fill in row 1 (columns 2 and up) with the default-penalty constants and a prior of 1.0
+        for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
+            updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
+        }
+    }
+    /** Returns the log10 likelihood of the read given the haplotype, recomputing columns from hapStartIndex + 1 onwards. */
+    @Override
+    public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
+                                                            final byte[] readBases,
+                                                            final byte[] readQuals,
+                                                            final byte[] insertionGOP,
+                                                            final byte[] deletionGOP,
+                                                            final byte[] overallGCP,
+                                                            final int hapStartIndex,
+                                                            final boolean recacheReadValues ) {
+
+        if( recacheReadValues ) {
+            initializeConstants( insertionGOP, deletionGOP, overallGCP );
+        }
+        initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );
+
+        // only the part of the matrices covering this read and haplotype is used: actual length + 2 in each dimension (see initialize)
+        final int X_METRIC_LENGTH = readBases.length + 2;
+        final int Y_METRIC_LENGTH = haplotypeBases.length + 2;
+
+        for (int i = 2; i < X_METRIC_LENGTH; i++) {
+            for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
+                updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
+            }
+        }
+
+        // final probability is the log10 of the sum of the last element in all three state arrays, with the scale factor removed
+        final int endI = X_METRIC_LENGTH - 1;
+        final int endJ = Y_METRIC_LENGTH - 1;
+        return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10;
+    }
+
+    /**
+     * Fills the distanceMatrix with the plain probability of each read base given each
+     * haplotype base (match or 'N': qualToProb, mismatch: qualToErrorProb).
+     *
+     * @param haplotypeBases the bases of the haplotype
+     * @param readBases      the bases of the read
+     * @param readQuals      the base quality scores of the read
+     * @param startIndex     first haplotype index to update in the distanceMatrix; earlier columns keep their previous values
+     */
+    public void initializeDistanceMatrix( final byte[] haplotypeBases,
+                                          final byte[] readBases,
+                                          final byte[] readQuals,
+                                          final int startIndex ) {
+
+        // fill the base-probability matrix (plain probabilities, not log10) for all combinations of read x haplotype bases
+        // Relies on java initializing arrays with 0.0, so there is no need to fill in rows and columns below 2.
+
+        for (int i = 0; i < readBases.length; i++) {
+            final byte x = readBases[i];
+            final byte qual = readQuals[i];
+            for (int j = startIndex; j < haplotypeBases.length; j++) {
+                final byte y = haplotypeBases[j];
+                distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
+                        QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
+            }
+        }
+    }
+
+    /**
+     * Fills the constantMatrix with the plain-probability transition constants derived from the read's quality scores.
+     *
+     * @param insertionGOP   insertion quality scores of the read
+     * @param deletionGOP    deletion quality scores of the read
+     * @param overallGCP     overall gap continuation penalty
+     */
+    public void initializeConstants( final byte[] insertionGOP,
+                                     final byte[] deletionGOP,
+                                     final byte[] overallGCP ) {
+
+        final int l = insertionGOP.length;
+        constantMatrix[1] = firstRowConstantMatrix; // row 1 aliases the shared static array, it is not a copy
+        for (int i = 0; i < l; i++) {
+            final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
+            constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP);
+            constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]);
+            constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]);
+            constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]);
+            constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]);
+            constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]);
+        }
+        constantMatrix[l+1][4] = 1.0; // row of the last read base: the two constants feeding the Y array are reset to 1.0
+        constantMatrix[l+1][5] = 1.0;
+    }
+
+    /**
+     * Updates a cell in the HMM matrix
+     *
+     * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
+     * initial conditions
+     *
+     * @param indI             row index in the matrices to update
+     * @param indJ             column index in the matrices to update
+     * @param prior            the match/mismatch probability for this cell (a distanceMatrix entry, or 1.0 for row 1)
+     * @param constants        the six constants for this row: [0] M->M, [1] X->M and Y->M, [2] M->X, [3] X->X, [4] M->Y, [5] Y->Y
+     * @param matchMetricArray the matches likelihood matrix
+     * @param XMetricArray     the insertions likelihood matrix
+     * @param YMetricArray     the deletions likelihood matrix
+     */
+    private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
+                             final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
+
+        matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] +
+                                                 XMetricArray[indI - 1][indJ - 1] * constants[1] +
+                                                 YMetricArray[indI - 1][indJ - 1] * constants[1] );
+        XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3];
+        YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5];
+    }
+}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
index 580667ee2..b839382dc 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java
@@ -5,7 +5,9 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
 
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.List;
 
 /**
  * @author ebanks
@@ -34,7 +36,6 @@ public class BQSRIntegrationTest extends WalkerTest {
                     " -I " + bam +
                     " -L " + interval +
                     args +
-                    " --no_plots" +
                     " -knownSites " + (reference.equals(b36KGReference) ? b36dbSNP129 : hg18dbSNP132) +
                     " -o %s";
         }
@@ -50,21 +51,21 @@ public class BQSRIntegrationTest extends WalkerTest {
         String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
         String HiSeqInterval = "chr1:10,000,000-10,100,000";
         return new Object[][]{
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "1cfc73371abb933ca26496745d105ff0")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "ee5142776008741b1b2453b1258c6d99")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "fbc520794f0f98d52159de956f7217f1")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab5b93794049c514bf8e407019d76b67")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "81df636e3d0ed6f16113517e0169bc96")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "ad3c47355448f8c45e172c6e1129c65d")},
-                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "fef7240140a9b6d6335ce009fa4edec5")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "600652ee49b9ce1ca2d8ee2d8b7c8211")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "769f95b9dcc78a405d3e6b191e5a19f5")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "43fcba51264cc98bd8466d21e1b96766")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "48aaf9ac54b97eac3663882a59354ab2")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "dac04b9e1e1c52af8d3a50c2e550fda9")},
-                {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "90d70542076715a8605a8d4002614b34")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "600652ee49b9ce1ca2d8ee2d8b7c8211")},
-                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "26a04f5a28c40750c603cbe8a926d7bd")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "387b41dc2221a1a4a782958944662b25")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "b5e26902e76abbd59f94f65c70d18165")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "a8a9c3f83269911cb61c5fe8fb98dc4a")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f43a0473101c63ae93444c300d843e81")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "9e05e63339d4716584bfc717cab6bd0f")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "1cf9b9c9c64617dc0f3d2f203f918dbe")},
+                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "aa1949a77bc3066fee551a217c970c0d")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "f70d8b5358bc2f76696f14b7a807ede0")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4c0f63e06830681560a1e9f9aad9fe98")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "be2812cd3dae3c326cf35ae3f1c8ad9e")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "03c29a0c1d21f72b12daf51cec111599")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "7080b2cad02ec6e67ebc766b2dccebf8")},
+                {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "30e76055c16843b6e33e5b9bd8ced57c")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "f70d8b5358bc2f76696f14b7a807ede0")},
+                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "5e657fd6a44dcdc7674b6e5a2de5dc83")},
         };
     }
 
@@ -74,12 +75,6 @@ public class BQSRIntegrationTest extends WalkerTest {
                 params.getCommandLine(),
                 Arrays.asList(params.md5));
         executeTest("testBQSR-"+params.args, spec).getFirst();
-
-        // TODO -- re-enable once parallelization is fixed in BaseRecalibrator
-        //WalkerTestSpec specNT2 = new WalkerTestSpec(
-        //        params.getCommandLine() + " -nt 2",
-        //        Arrays.asList(params.md5));
-        //executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst();
     }
 
     @Test
@@ -89,7 +84,6 @@ public class BQSRIntegrationTest extends WalkerTest {
                         " -R " + b36KGReference +
                         " -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" +
                         " -L 1:10,000,000-10,200,000" +
-                        " --no_plots" +
                         " -o %s",
                 1, // just one output file
                 UserException.CommandLineException.class);
@@ -103,7 +97,6 @@ public class BQSRIntegrationTest extends WalkerTest {
                         " -R " + b36KGReference +
                         " -I " + privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam" +
                         " -L 1:50,000-80,000" +
-                        " --no_plots" +
                         " -o %s",
                 1, // just one output file
                 UserException.class);
@@ -127,21 +120,27 @@ public class BQSRIntegrationTest extends WalkerTest {
 
     @DataProvider(name = "PRTest")
     public Object[][] createPRTestData() {
-        return new Object[][]{
-                {new PRTest("", "d2d6ed8667cdba7e56f5db97d6262676")},
-                {new PRTest(" -qq -1", "b7053d3d67aba6d8892f0a60f0ded338")},
-                {new PRTest(" -qq 6", "bfbf0855185b2b70aa35237fb71e4487")},
-                {new PRTest(" -DIQ", "66aa65223f192ee39c1773aa187fd493")}
-        };
+        List<Object[]> tests = new ArrayList<Object[]>();
+        // each row is {nct, PRTest}; testPR appends " -nct " + nct to the PrintReads command line
+        tests.add(new Object[]{1, new PRTest(" -qq -1", "5226c06237b213b9e9b25a32ed92d09a")});
+        tests.add(new Object[]{1, new PRTest(" -qq 6", "b592a5c62b952a012e18adb898ea9c33")});
+        tests.add(new Object[]{1, new PRTest(" -DIQ", "8977bea0c57b808e65e9505eb648cdf7")});
+        // the no-extra-argument case is registered with the same md5 for nct = 1, 2 and 4
+        for ( final int nct : Arrays.asList(1, 2, 4) ) {
+            tests.add(new Object[]{nct, new PRTest("", "ab2f209ab98ad3432e208cbd524a4c4a")});
+        }
+
+        return tests.toArray(new Object[][]{});
     }
 
     @Test(dataProvider = "PRTest")
-    public void testPR(PRTest params) {
+    public void testPR(final int nct, PRTest params) {
         WalkerTestSpec spec = new WalkerTestSpec(
                 "-T PrintReads" +
                         " -R " + hg18Reference +
                         " -I " + privateTestDir + "HiSeq.1mb.1RG.bam" +
-                        " -BQSR " + privateTestDir + "HiSeq.1mb.1RG.table" +
+                        " -nct " + nct +
+                        " -BQSR " + privateTestDir + "HiSeq.20mb.1RG.table" +
                         params.args +
                         " -o %s",
                 Arrays.asList(params.md5));
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java
index a8707641a..3e5cbf0e8 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java
@@ -63,7 +63,7 @@ public class BaseCountsUnitTest extends BaseTest {
 
         String name = String.format("Test-%s", params.bases);
         Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
-        Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name);
+        Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name);
         Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name);
     }
 }
\ No newline at end of file
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java
index 3f1cc7a3c..db8ea4eb8 100755
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java
@@ -21,33 +21,33 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
         executeTest(testName, spec);
     }
 
-    @Test(enabled = true)
+    @Test(enabled = false)
     public void testDefaultCompression() {
         RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f");
     }
 
-    @Test(enabled = true)
+    @Test(enabled = false)
     public void testMultipleIntervals() {
         String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
         RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76");
     }
 
-    @Test(enabled = true)
+    @Test(enabled = false)
     public void testHighCompression() {
         RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047");
     }
 
-    @Test(enabled = true)
+    @Test(enabled = false)
     public void testLowCompression() {
         RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63");
     }
 
-    @Test(enabled = true)
+    @Test(enabled = false)
     public void testIndelCompression() {
         RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973");
     }
 
-    @Test(enabled = true)
+    @Test(enabled = false)
     public void testFilteredDeletionCompression() {
         String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
         executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb")));
@@ -61,7 +61,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
      * 
      * This bam is simplified to replicate the exact bug with the three provided intervals.
      */
-    @Test(enabled = true)
+    @Test(enabled = false)
     public void testAddingReadAfterTailingTheStash() {
         String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
         executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6")));
@@ -71,7 +71,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
      * Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get
      * filtered out.
      */
-    @Test(enabled = true)
+    @Test(enabled = false)
     public void testDivideByZero() {
         String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
         executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2")));
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java
index e651c018c..738fe4a2e 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SyntheticReadUnitTest.java
@@ -35,7 +35,7 @@ public void testBaseCounts() {
                 new TestRead(bases, quals, new Byte[] {1, 127, 51, 126},    new byte [] {1, 126, 50, 125})};
 
         for (TestRead testRead : testReads) {
-            SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false);
+            SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false);
             Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts());
         }
 }
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java
index 6ae34f190..73bc8fba6 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java
@@ -1,9 +1,9 @@
 package org.broadinstitute.sting.gatk.walkers.genotyper;
 
 import org.broadinstitute.sting.WalkerTest;
+import org.testng.annotations.Test;
 
 import java.util.Arrays;
-import org.testng.annotations.Test;
 
 /**
  * Created by IntelliJ IDEA.
@@ -18,8 +18,9 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
     final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
     final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
     final String REFSAMPLE_NAME = "NA12878";
-    final String MTINTERVALS = "MT:1-3000";
-    final String LSVINTERVALS = "20:40,000,000-41,000,000";
+    final String MTINTERVALS = "MT:1-1000";
+    final String LSVINTERVALS = "20:40,500,000-41,000,000";
+    final String LSVINTERVALS_SHORT = "20:40,500,000-40,501,000";
     final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
     final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
     final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";
@@ -38,6 +39,13 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
         executeTest("testPoolCaller:"+name+" args=" + args, spec);
     }
 
+    private void PC_LSV_Test_short(String args, String name, String model, String md5) {
+        // UnifiedGenotyper run restricted to LSVINTERVALS_SHORT; md5 is passed on to WalkerTestSpec
+        final String prefix = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane ",
+                REF, LSV_BAM, LSVINTERVALS_SHORT, NA12878_WG_CALLS, REFSAMPLE_NAME, model);
+        executeTest("testPoolCaller:"+name+" args=" + args, new WalkerTestSpec(prefix + " --no_cmdline_in_header -o %s" + " " + args, Arrays.asList(md5)));
+    }
+
     private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) {
         final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane",
                 REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s";
@@ -45,33 +53,38 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
         executeTest("testPoolCaller:"+name+" args=" + args, spec);
     }
 
+    @Test(enabled = true)
+    public void testSNP_ACS_Pools() {
+        PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES","LSV_SNP_ACS","SNP","df0e67c975ef74d593f1c704daab1705");
+    }
+
     @Test(enabled = true)
     public void testBOTH_GGA_Pools() {
-        PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","0934f72865388999efec64bd9d4a9b93");
+        PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","7e5b28c9e21cc7e45c58c41177d8a0fc");
     }
 
     @Test(enabled = true)
     public void testINDEL_GGA_Pools() {
-        PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES  -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","126581c72d287722437274d41b6fed7b");
+        PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES  -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","ae6c276cc46785a794acff6f7d10ecf7");
     }
 
     @Test(enabled = true)
     public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
-        PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","b543aa1c3efedb301e525c1d6c50ed8d");
+        PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","481452ad7d6378cffb5cd834cc621d55");
     }
 
     @Test(enabled = true)
     public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
-        PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","55b20557a836bb92688e68f12d7f5dc4");
+        PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","812957e51277aca9925c1a7bb4d9a118");
     }
 
     @Test(enabled = true)
     public void testMT_SNP_DISCOVERY_sp4() {
-         PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","7eb889e8e07182f4c3d64609591f9459");
+         PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","dd568dc30be90135a3a8957a45a7321c");
     }
 
     @Test(enabled = true)
     public void testMT_SNP_GGA_sp10() {
-        PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES  -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "db8114877b99b14f7180fdcd24b040a7");
+        PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES  -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "bf793c43b635a931207170be8035b288");
     }
 }
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java
new file mode 100644
index 000000000..556b7451f
--- /dev/null
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcPerformanceUnitTest.java
@@ -0,0 +1,87 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
+
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.collections.Pair;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+public class AFCalcPerformanceUnitTest extends BaseTest {
+    @DataProvider(name = "ScalingTests")
+    public Object[][] makepolyTestProviderLotsOfAlleles() {
+        List<Object[]> tests = new ArrayList<Object[]>();
+
+        // list of all high-quality models in the system
+        final List<AFCalcFactory.Calculation> biAllelicModels = Arrays.asList(
+                AFCalcFactory.Calculation.EXACT_INDEPENDENT,
+                AFCalcFactory.Calculation.EXACT_REFERENCE);
+
+        final List<AFCalcFactory.Calculation> multiAllelicModels = Arrays.asList(
+                AFCalcFactory.Calculation.EXACT_INDEPENDENT);
+
+        // Test grid: every permutation of the allele counts below for 1-3 alt alleles and
+        // 100 or 1000 samples, keeping only the cases whose summed AC is below 2 * nSamples.
+        // For a single large debugging case, shrink the lists to nSamples = 10000,
+        // alleleCounts = 50 and nAltAlleles = 1.
+        for ( final int nonTypePLs : Arrays.asList(100) ) {
+            for ( final int nSamples : Arrays.asList(100, 1000) ) {
+                final List<Integer> alleleCounts = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 50, 500);
+                for ( final int nAltAlleles : Arrays.asList(1, 2, 3) ) {
+                    final List<AFCalcFactory.Calculation> models = nAltAlleles > 1 ? multiAllelicModels : biAllelicModels;
+                    for ( final AFCalcFactory.Calculation model : models ) {
+                        for ( final List<Integer> ACs : Utils.makePermutations(alleleCounts, nAltAlleles, true) ) {
+                            if ( MathUtils.sum(ACs) < nSamples * 2 ) {
+                                final AFCalcTestBuilder testBuilder
+                                        = new AFCalcTestBuilder(nSamples, nAltAlleles, model, AFCalcTestBuilder.PriorType.human);
+                                tests.add(new Object[]{testBuilder, ACs, nonTypePLs});
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    private Pair<Integer, Integer> estNumberOfEvaluations(final AFCalcTestBuilder testBuilder, final VariantContext vc, final int nonTypePL) {
+        // Heuristic (min, max) bounds on the evaluation count, summed over the alt alleles of vc;
+        // testBuilder and nonTypePL are currently unused.
+        final int evalOverhead = 2;
+        final int maxEvalsPerSamplePerAC = 3;
+        int minEvals = 0, maxEvals = 0;
+
+        for ( final Allele alt : vc.getAlternateAlleles() ) {
+            final int AC = vc.getCalledChrCount(alt);
+            minEvals += AC + evalOverhead; // everyone is hom-var
+            maxEvals += AC * maxEvalsPerSamplePerAC + 10;
+        }
+        return new Pair<Integer, Integer>(minEvals, maxEvals);
+    }
+
+    @Test(dataProvider = "ScalingTests")
+    private void testScaling(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL) {
+        final AFCalc calc = testBuilder.makeModel();
+        final double[] priors = testBuilder.makePriors();
+        final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);
+        final AFCalcResult result = calc.getLog10PNonRef(vc, priors);
+        final Pair<Integer, Integer> expectedNEvaluation = estNumberOfEvaluations(testBuilder, vc, nonTypePL);
+        final int minEvals = expectedNEvaluation.getFirst();
+        final int maxEvals = expectedNEvaluation.getSecond();
+
+        logger.warn(" min " + minEvals + " obs " + result.getnEvaluations() + " max " + maxEvals + " for test " + testBuilder + " sum(ACs)=" + (int)MathUtils.sum(ACs));
+        // the observed evaluation count must fall inside the estimated [minEvals, maxEvals] window
+        Assert.assertTrue(result.getnEvaluations() >= minEvals,
+                "Actual number of evaluations " + result.getnEvaluations() + " < min number of evals " + minEvals);
+        Assert.assertTrue(result.getnEvaluations() <= maxEvals,
+                "Actual number of evaluations " + result.getnEvaluations() + " > max number of evals " + maxEvals);
+    }
+}
\ No newline at end of file
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java
new file mode 100644
index 000000000..cbe2eb268
--- /dev/null
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java
@@ -0,0 +1,82 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
+
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+public class AFCalcResultUnitTest extends BaseTest {
+    private static class MyTest { // one case: the input Ls and the posteriors expected for them
+        final double[] Ls, expectedPosteriors;
+
+        private MyTest(double[] ls, double[] expectedPosteriors) {
+            Ls = ls;
+            this.expectedPosteriors = expectedPosteriors;
+        }
+
+        @Override
+        public String toString() {
+            return "Ls [" + Utils.join(",", Ls) + "] expectedPosteriors [" + Utils.join(",", expectedPosteriors) + "]";
+        }
+    }
+
+    @DataProvider(name = "TestComputePosteriors")
+    public Object[][] makeTestCombineGLs() {
+        List<Object[]> tests = new ArrayList<Object[]>();
+
+        tests.add(new Object[]{new MyTest(log10Even, log10Even)});
+        // sweep both values towards 0 from below; expectations come from MathUtils.normalizeFromLog10
+        for ( double L0 = -1e9; L0 < 0.0; L0 /= 10.0 ) {
+            for ( double L1 = -1e2; L1 < 0.0; L1 /= 100.0 ) {
+                final double[] input = new double[]{L0, L1};
+                final double[] expected = MathUtils.normalizeFromLog10(input, true);
+                tests.add(new Object[]{new MyTest(input, expected)});
+            }
+        }
+        // sweep bigBadL from -1e50 down towards -1e200; each step is 10x more negative
+        for ( double bigBadL = -1e50; bigBadL > -1e200; bigBadL *= 10 ) {
+            // test that a huge bad likelihood remains, even with a massive better result
+            for ( final double betterL : Arrays.asList(-1000.0, -100.0, -10.0, -1.0, -0.1, -0.01, -0.001, 0.0)) {
+                tests.add(new Object[]{new MyTest(new double[]{bigBadL, betterL}, new double[]{bigBadL, 0.0})});
+                tests.add(new Object[]{new MyTest(new double[]{betterL, bigBadL}, new double[]{0.0, bigBadL})});
+            }
+        }
+
+        // test that a modest bad likelihood with an ~0.0 value doesn't get lost
+        for ( final double badL : Arrays.asList(-10000.0, -1000.0, -100.0, -10.0)) {
+            tests.add(new Object[]{new MyTest(new double[]{badL, -1e-9}, new double[]{badL, 0.0})});
+            tests.add(new Object[]{new MyTest(new double[]{-1e-9, badL}, new double[]{0.0, badL})});
+        }
+
+        // test that the non-ref value is kept as the AF > 0 posterior when the other value is exactly 0.0
+        for ( final double nonRefL : Arrays.asList(-100.0, -50.0, -10.0, -9.0, -8.0, -7.0, -6.0, -5.0)) {
+            tests.add(new Object[]{new MyTest(new double[]{0.0, nonRefL}, new double[]{0.0, nonRefL})});
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    // shared fixtures: the normalized form of two equal values, and an A (ref) / C site
+    final static double[] log10Even = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true);
+    final static Allele C = Allele.create("C");
+    final static List<Allele> alleles = Arrays.asList(Allele.create("A", true), C);
+
+    @Test(enabled = true, dataProvider = "TestComputePosteriors")
+    private void testComputingPosteriors(final MyTest data) {
+        final AFCalcResult result = new AFCalcResult(new int[]{0}, 1, alleles, data.Ls, log10Even, Collections.singletonMap(C, -1.0));
+
+        Assert.assertEquals(result.getLog10PosteriorOfAFEq0(), data.expectedPosteriors[0], 1e-3, "AF = 0 not expected");
+        Assert.assertEquals(result.getLog10PosteriorOfAFGT0(), data.expectedPosteriors[1], 1e-3, "AF > 0 not expected");
+
+        final double[] actualPosteriors = new double[]{result.getLog10PosteriorOfAFEq0(), result.getLog10PosteriorOfAFGT0()};
+        Assert.assertEquals(MathUtils.sumLog10(actualPosteriors), 1.0, 1e-3, "Posteriors don't sum to 1 with 1e-3 precision");
+    }
+}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java
new file mode 100644
index 000000000..2d346e548
--- /dev/null
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcUnitTest.java
@@ -0,0 +1,687 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
+
+import org.apache.commons.lang.ArrayUtils;
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.QualityUtils;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.variantcontext.*;
+import org.testng.Assert;
+import org.testng.annotations.BeforeSuite;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.util.*;
+
+
+public class AFCalcUnitTest extends BaseTest {
+    static Allele A = Allele.create("A", true);
+    static Allele C = Allele.create("C");
+    static Allele G = Allele.create("G");
+
+    static int sampleNameCounter = 0;
+    static Genotype AA1, AB1, BB1, NON_INFORMATIVE1;
+    static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2;
+    final double[] FLAT_3SAMPLE_PRIORS = MathUtils.normalizeFromLog10(new double[2*3+1], true);  // flat priors
+
+    final private static boolean INCLUDE_BIALLELIC = true;
+    final private static boolean INCLUDE_TRIALLELIC = true;
+    final private static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug
+    final private static boolean DEBUG_ONLY = false;
+
+    @BeforeSuite
+    public void before() { // makePL names samples from a running counter, so the call order below fixes the names
+        AA1 = makePL(Arrays.asList(A, A), 0, 20, 20); // bi-allelic fixtures: three PLs, the 0 marks the called genotype
+        AB1 = makePL(Arrays.asList(A, C), 20, 0, 20);
+        BB1 = makePL(Arrays.asList(C, C), 20, 20, 0);
+        NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0); // no-call, all PLs equal
+
+        AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20); // tri-allelic fixtures: six PLs per sample
+        AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20);
+        BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20);
+        AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20);
+        BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20);
+        CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0);
+        NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0);
+    }
+
+    protected static Genotype makePL(final List<Allele> expectedGT, int ... pls) { // genotype with the given alleles and PLs
+        GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++); // counter gives every sample a distinct name
+        gb.alleles(expectedGT);
+        gb.PL(pls);
+        return gb.make();
+    }
+
+    private class GetGLsTest extends TestDataProvider { // one AFCalc case: genotypes + model + priors
+        GenotypesContext GLs;
+        int numAltAlleles;
+        final AFCalc calc;
+        final int[] expectedACs;
+        final double[] priors;
+        final String priorName;
+        // arg holds the sample genotypes under test; priorName is only used in toString()
+        private GetGLsTest(final AFCalc calc, int numAltAlleles, List<Genotype> arg, final double[] priors, final String priorName) {
+            super(GetGLsTest.class);
+            GLs = GenotypesContext.create(new ArrayList<Genotype>(arg));
+            this.numAltAlleles = numAltAlleles;
+            this.calc = calc;
+            this.priors = priors;
+            this.priorName = priorName;
+
+            expectedACs = new int[numAltAlleles+1];
+            for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) {
+                // expected count = occurrences of this allele across the genotype calls (array starts zeroed)
+                final Allele allele = getAlleles().get(alleleI);
+                for ( Genotype g : arg ) {
+                    expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele);
+                }
+            }
+        }
+
+        public AFCalcResult execute() {
+            return getCalc().getLog10PNonRef(getVC(), getPriors());
+        }
+
+        public AFCalcResult executeRef() {
+            final AFCalc ref = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_REFERENCE, getCalc().nSamples, getCalc().getMaxAltAlleles());
+            return ref.getLog10PNonRef(getVC(), getPriors());
+        }
+
+        public double[] getPriors() {
+            return priors;
+        }
+
+        public AFCalc getCalc() {
+            return calc;
+        }
+
+        public VariantContext getVC() {
+            VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles());
+            builder.genotypes(GLs);
+            return builder.make();
+        }
+
+        public List<Allele> getAlleles() {
+            return Arrays.asList(Allele.create("A", true),
+                    Allele.create("C"),
+                    Allele.create("G"),
+                    Allele.create("T")).subList(0, numAltAlleles+1);
+        }
+
+        public int getExpectedAltAC(final int alleleI) {
+            return expectedACs[alleleI+1]; // slot 0 belongs to the reference allele
+        }
+
+        public String toString() {
+            return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(),
+                    priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs);
+        }
+    }
+
+    @DataProvider(name = "wellFormedGLs")
+    public Object[][] createSimpleGLsData() {
+        final List<Genotype> biAllelicSamples = Arrays.asList(AA1, AB1, BB1);
+        final List<Genotype> triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);
+        // each new GetGLsTest(...) below is presumably registered by TestDataProvider and returned by getTests()
+        for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
+            List<AFCalc> calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2);
+
+            final int nPriorValues = 2*nSamples+1;
+            final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true);  // flat priors
+            final double[] humanPriors = new double[nPriorValues];
+            UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
+
+            for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) {
+                for ( AFCalc model : calcs ) {
+                    final String priorName = priors == humanPriors ? "human" : "flat";
+
+                    // bi-allelic
+                    if ( INCLUDE_BIALLELIC && nSamples <= biAllelicSamples.size() )
+                        for ( List<Genotype> genotypes : Utils.makePermutations(biAllelicSamples, nSamples, true) )
+                            new GetGLsTest(model, 1, genotypes, priors, priorName);
+
+                    // tri-allelic: skipped for human priors unless Guillermo_FIXME, and for OriginalDiploidExactAFCalc
+                    if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) && ! ( model instanceof OriginalDiploidExactAFCalc) )
+                        for ( List<Genotype> genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) )
+                            new GetGLsTest(model, 2, genotypes, priors, priorName);
+                }
+            }
+        }
+
+        return GetGLsTest.getTests(GetGLsTest.class);
+    }
+
+//    @DataProvider(name = "badGLs")
+//    public Object[][] createBadGLs() {
+//        final List genotypes = Arrays.asList(AB2, BB2, CC2, CC2);
+//        final int nSamples = genotypes.size();
+//
+//        final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4);
+//
+//        final int nPriorValues = 2*nSamples+1;
+//        final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true);  // flat priors
+//        for ( AFCalc model : Arrays.asList(indCalc) ) {
+//            final String priorName = "flat";
+//            new GetGLsTest(model, 2, genotypes, priors, priorName);
+//        }
+//
+//        return GetGLsTest.getTests(GetGLsTest.class);
+//    }
+
+//
+//    @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs")
+//    public void testBadGLs(GetGLsTest cfg) {
+//        testResultSimple(cfg);
+//    }
+
+    @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
+    public void testBiallelicGLs(GetGLsTest cfg) { // runs the simple result check on the two-allele configs of "wellFormedGLs"
+        if ( cfg.getAlleles().size() == 2 ) // any other allele count returns without asserting anything
+            testResultSimple(cfg);
+    }
+
+    @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
+    public void testTriallelicGLs(GetGLsTest cfg) { // runs the simple result check on configs with more than two alleles
+        if ( cfg.getAlleles().size() > 2 ) // two-allele configs return without asserting anything (see testBiallelicGLs)
+            testResultSimple(cfg);
+    }
+
+    private static class NonInformativeData { // one scenario consumed by makeGLsWithNonInformative
+        final Genotype nonInformative; // genotype that gets replicated to pad the sample list
+        final List called; // genotypes that are also evaluated on their own as the baseline
+        final int nAltAlleles; // number of alt alleles passed on to GetGLsTest
+
+        private NonInformativeData(List called, Genotype nonInformative, int nAltAlleles) { // plain field-by-field holder, no validation
+            this.called = called;
+            this.nonInformative = nonInformative;
+            this.nAltAlleles = nAltAlleles;
+        }
+    }
+
+    @DataProvider(name = "GLsWithNonInformative")
+    public Object[][] makeGLsWithNonInformative() { // rows are {baseline test, same test padded with non-informative genotypes}
+        List tests = new ArrayList();
+
+        final List nonInformativeTests = new LinkedList();
+        nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1));
+        nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2));
+        nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2));
+
+        for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) { // how many padding genotypes to append
+            for ( final NonInformativeData testData : nonInformativeTests ) {
+                final List samples = new ArrayList();
+                samples.addAll(testData.called);
+                samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative));
+
+                final int nSamples = samples.size();
+                List calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2); // NOTE(review): fixed arguments (4, 2, 2) even though nSamples varies -- confirm what they bound
+
+                final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true);  // flat priors, sized for the padded list and reused for the baseline test too
+
+                for ( AFCalc model : calcs ) {
+                    if ( testData.nAltAlleles > 1 && model instanceof OriginalDiploidExactAFCalc ) // this model is never paired with multi-alt scenarios here
+                        continue;
+
+                    final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat");
+
+                    for ( int rotation = 0; rotation < nSamples; rotation++ ) { // one row per cyclic position of the sample list
+                        Collections.rotate(samples, 1); // rotates in place; NOTE(review): the same list instance is handed to every GetGLsTest -- confirm it is copied there
+                        final GetGLsTest withNonInformative = new GetGLsTest(model, testData.nAltAlleles, samples, priors, "flat");
+                        tests.add(new Object[]{onlyInformative, withNonInformative});
+                    }
+                }
+            }
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "GLsWithNonInformative", dependsOnMethods = {"testBiallelicGLs", "testTriallelicGLs"})
+    public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) { // padded result must match the baseline result
+        final AFCalcResult expected = onlyInformative.execute(); // baseline: called genotypes only
+        final AFCalcResult actual = withNonInformative.execute(); // same genotypes plus non-informative padding
+        // the padded test must also pass the simple check on its own before being compared with the baseline
+        testResultSimple(withNonInformative);
+        compareAFCalcResults(actual, expected, onlyInformative.getCalc(), true);
+    }
+
+    private void testResultSimple(final GetGLsTest cfg) { // shared check: cfg's result vs. its reference result, plus MLE allele counts
+        final AFCalcResult refResultTracker = cfg.executeRef();
+        final AFCalcResult resultTracker = cfg.execute();
+        // compare against the reference result, skipping the prior/likelihood assertions
+        compareAFCalcResults(resultTracker, refResultTracker, cfg.getCalc(), true);
+        // the result may only mention alleles that were in the test's own allele list
+        Assert.assertNotNull(resultTracker.getAllelesUsedInGenotyping());
+        Assert.assertTrue(cfg.getAlleles().containsAll(resultTracker.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list");
+
+        for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) { // per alt allele: MLE AC must equal the expected AC
+            int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI);
+            int calcAC_MLE = resultTracker.getAlleleCountsOfMLE()[altAlleleI];
+
+            final Allele allele = cfg.getAlleles().get(altAlleleI+1); // +1 skips index 0 of the allele list; used only in the failure message
+            Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele);
+        }
+    }
+
+    private void compareAFCalcResults(final AFCalcResult actual, final AFCalcResult expected, final AFCalc calc, final boolean onlyPosteriorsShouldBeEqual) { // asserts that two results agree; both call sites visible in this section pass true for the flag
+        // note we cannot really test the multi-allelic case because we actually meaningfully differ among the models here
+        final double TOLERANCE = calc.getMaxAltAlleles() > 1 ? 1000 : 0.1; // much tighter constraints on bi-allelic results
+
+        if ( ! onlyPosteriorsShouldBeEqual ) { // priors and likelihoods are compared only when the caller asks for it
+            Assert.assertEquals(actual.getLog10PriorOfAFEq0(), expected.getLog10PriorOfAFEq0(), TOLERANCE, "Priors AF == 0");
+            Assert.assertEquals(actual.getLog10PriorOfAFGT0(), expected.getLog10PriorOfAFGT0(), TOLERANCE, "Priors AF > 0");
+            Assert.assertEquals(actual.getLog10LikelihoodOfAFEq0(), expected.getLog10LikelihoodOfAFEq0(), TOLERANCE, "Likelihoods AF == 0");
+            Assert.assertEquals(actual.getLog10LikelihoodOfAFGT0(), expected.getLog10LikelihoodOfAFGT0(), TOLERANCE, "Likelihoods AF > 0");
+        }
+        Assert.assertEquals(actual.getLog10PosteriorOfAFEq0(), expected.getLog10PosteriorOfAFEq0(), TOLERANCE, "Posteriors AF == 0");
+        Assert.assertEquals(actual.getLog10PosteriorOfAFGT0(), expected.getLog10PosteriorOfAFGT0(), TOLERANCE, "Posteriors AF > 0");
+        Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE(), "MLE ACs"); // no tolerance passed for this or the next assertion
+        Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping(), "Alleles used in genotyping");
+
+        for ( final Allele a : expected.getAllelesUsedInGenotyping() ) {
+            if ( ! a.isReference() ) { // per-allele MLE counts are checked for non-reference alleles only
+                Assert.assertEquals(actual.getAlleleCountAtMLE(a), expected.getAlleleCountAtMLE(a), "MLE AC for allele " + a);
+                // TODO -- enable me when IndependentAllelesDiploidExactAFCalc works properly
+//                if ( ! ( calc instanceof GeneralPloidyExactAFCalc ) )
+//                    // TODO -- delete when general ploidy works properly with multi-allelics
+//                    Assert.assertEquals(actual.isPolymorphic(a, 0.0), expected.isPolymorphic(a, 0.0), "isPolymorphic with thread 0.0 for allele " + a);
+            }
+        }
+    }
+
+    @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
+    public void testLargeGLs(final ExactAFCalc calc) { // very large PL values must still give the expected MLE allele count
+        final Genotype BB = makePL(Arrays.asList(C, C), 20000000, 20000000, 0); // only the third PL is 0; the other two are 20,000,000
+        GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(BB, BB, BB), FLAT_3SAMPLE_PRIORS, "flat");
+
+        final AFCalcResult resultTracker = cfg.execute();
+
+        int calculatedAlleleCount = resultTracker.getAlleleCountsOfMLE()[0];
+        Assert.assertEquals(calculatedAlleleCount, 6); // three samples, each built from alleles (C, C)
+    }
+
+    @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
+    public void testMismatchedGLs(final ExactAFCalc calc) { // two samples whose PL vectors have their zero at different positions
+        final Genotype AB = makePL(Arrays.asList(A, C), 2000, 0, 2000, 2000, 2000, 2000); // zero PL at index 1, all others 2000
+        final Genotype AC = makePL(Arrays.asList(A, G), 100, 100, 100, 0, 100, 100); // zero PL at index 3, all others 100
+        GetGLsTest cfg = new GetGLsTest(calc, 2, Arrays.asList(AB, AC), FLAT_3SAMPLE_PRIORS, "flat");
+
+        final AFCalcResult resultTracker = cfg.execute();
+        // each of the two alt alleles is expected to have an MLE allele count of exactly 1
+        Assert.assertEquals(resultTracker.getAlleleCountsOfMLE()[0], 1);
+        Assert.assertEquals(resultTracker.getAlleleCountsOfMLE()[1], 1);
+    }
+
+    // --------------------------------------------------------------------------------
+    //
+    // Code to test that the pNonRef value is meaningful
+    //
+    // --------------------------------------------------------------------------------
+
+    private static class PNonRefData { // one expected-pNonRef scenario for the "PNonRef" data provider
+        final Genotype g; // the single genotype under test
+        final double pNonRef, tolerance; // expected pNonRef (linear scale here) and the tolerance passed to the assertion
+        final boolean canScale; // whether scale() may derive scaled variants from this entry
+        final List badModels; // model types for which this entry is skipped by the provider
+        final VariantContext vc; // context the genotype is attached to
+
+        private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale) { // convenience form: no bad models
+            this(vc, g, pNonRef, tolerance, canScale, Collections.emptyList());
+        }
+
+        private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale, final List badModels) {
+            this.g = g;
+            this.pNonRef = pNonRef;
+            this.tolerance = tolerance;
+            this.canScale = canScale;
+            this.badModels = badModels;
+            this.vc = vc;
+        }
+
+        public PNonRefData scale(final int scaleFactor) { // returns a scaled copy, or this same object when canScale is false
+            if ( canScale ) {
+                final int[] PLs = new int[g.getPL().length];
+                for ( int i = 0; i < PLs.length; i++ ) PLs[i] = g.getPL()[i] * ((int)Math.log10(scaleFactor)+1); // multiplier is the truncated log10 of scaleFactor, plus one
+                final Genotype scaledG = new GenotypeBuilder(g).PL(PLs).make();
+                final double scaledPNonRef = pNonRef < 0.5 ? pNonRef / scaleFactor : 1 - ((1-pNonRef) / scaleFactor); // divides whichever of pNonRef or (1 - pNonRef) is the quantity below 0.5
+                return new PNonRefData(vc, scaledG, scaledPNonRef, tolerance, true); // five-argument constructor: the copy's badModels is empty
+            } else {
+                return this;
+            }
+        }
+    }
+
+    @DataProvider(name = "PNonRef")
+    public Object[][] makePNonRefTest() { // rows: {vc, modelType, priorType, genotypes, expected pNonRef, tolerance, nNonInformative}
+        List tests = new ArrayList();
+
+        final List AA = Arrays.asList(A, A);
+        final List AC = Arrays.asList(A, C);
+        final List CC = Arrays.asList(C, C);
+        final List AG = Arrays.asList(A, G);
+        final List GG = Arrays.asList(G, G);
+        final List CG = Arrays.asList(C, G);
+
+        final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make(); // two-allele site
+        final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make(); // three-allele site
+        final AFCalcTestBuilder.PriorType priorType = AFCalcTestBuilder.PriorType.flat;
+
+        final double TOLERANCE = 0.5; // applied by testPNonRef to log10 values
+
+        final List initialPNonRefData = Arrays.asList(
+                // bi-allelic sites
+                new PNonRefData(vc2, makePL(AA, 0, 10, 10), 0.1666667, TOLERANCE, true),
+                new PNonRefData(vc2, makePL(AA, 0,  1, 10), 0.4721084, TOLERANCE, false),
+                new PNonRefData(vc2, makePL(AA, 0,  1,  1), 0.6136992, TOLERANCE, false),
+                new PNonRefData(vc2, makePL(AA, 0,  5,  5), 0.3874259, TOLERANCE, false),
+                new PNonRefData(vc2, makePL(AC, 10, 0, 10), 0.9166667, TOLERANCE, true),
+                new PNonRefData(vc2, makePL(CC, 10, 10, 0), 0.9166667, TOLERANCE, true),
+
+                // tri-allelic sites -- cannot scale because of the naivety of our scaling estimator
+                new PNonRefData(vc3, makePL(AA, 0, 10, 10, 10, 10, 10), 0.3023255813953489, TOLERANCE * 2, false), // more tolerance because constrained model is a bit inaccurate
+                new PNonRefData(vc3, makePL(AC, 10, 0, 10, 10, 10, 10), 0.9166667, TOLERANCE, false),
+                new PNonRefData(vc3, makePL(CC, 10, 10, 0, 10, 10, 10), 0.9166667, TOLERANCE, false),
+                new PNonRefData(vc3, makePL(AG, 10, 10, 10, 0, 10, 10), 0.9166667, TOLERANCE, false),
+                new PNonRefData(vc3, makePL(CG, 10, 10, 10, 10, 0, 10), 0.80, TOLERANCE, false),
+                new PNonRefData(vc3, makePL(GG, 10, 10, 10, 10, 10, 0), 0.9166667, TOLERANCE, false)
+        );
+
+        for ( AFCalcFactory.Calculation modelType : Arrays.asList(AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcFactory.Calculation.EXACT_INDEPENDENT) ) {
+            for ( int nNonInformative = 0; nNonInformative < 3; nNonInformative++ ) { // only forwarded in the row; the genotype list below does not depend on it
+                for ( final PNonRefData rootData : initialPNonRefData ) {
+                    for ( int plScale = 1; plScale <= 100000; plScale *= 10 ) { // scale factors 1, 10, ..., 100000
+                        if ( ! rootData.badModels.contains(modelType) && (plScale == 1 || rootData.canScale) ) { // entries that cannot scale contribute only the plScale == 1 row
+                            final PNonRefData data = rootData.scale(plScale);
+                            tests.add(new Object[]{data.vc, modelType, priorType, Arrays.asList(data.g), data.pNonRef, data.tolerance, nNonInformative});
+                        }
+                    }
+                }
+            }
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRef")
+    private void testPNonRef(final VariantContext vcRoot, // NOTE(review): method is private -- confirm the TestNG version in use runs non-public @Test methods
+                             AFCalcFactory.Calculation modelType,
+                             AFCalcTestBuilder.PriorType priorType,
+                             final List genotypes,
+                             final double expectedPNonRef, // linear-scale probability; converted with Math.log10 below
+                             final double tolerance,
+                             final int nNonInformative) { // never read in this method body
+        final AFCalcTestBuilder testBuilder
+                = new AFCalcTestBuilder(1, vcRoot.getNAlleles()-1, modelType, priorType); // first argument fixed at 1; second is the site's allele count minus one
+
+        final VariantContextBuilder vcb = new VariantContextBuilder(vcRoot);
+        vcb.genotypes(genotypes);
+
+        final AFCalcResult resultTracker = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());
+        // compare the model's log10 posterior of AF > 0 with log10 of the expected pNonRef
+        Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), Math.log10(expectedPNonRef), tolerance,
+                "Actual pNonRef not within tolerance " + tolerance + " of expected");
+    }
+
+    @DataProvider(name = "PNonRefBiallelicSystematic")
+    public Object[][] makePNonRefBiallelicSystematic() { // rows: {modelType, genotypes built from systematically enumerated PL triples}
+        List tests = new ArrayList();
+
+        final List bigNonRefPLs = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 100, 1000);
+        final List> bigDiploidPLs = removeBadPLs(Utils.makePermutations(bigNonRefPLs, 3, true)); // PL triples that survive the removeBadPLs filter
+
+        for ( AFCalcFactory.Calculation modelType : AFCalcFactory.Calculation.values() ) {
+
+            if ( false ) { // for testing only
+                tests.add(new Object[]{modelType, toGenotypes(Arrays.asList(Arrays.asList(0,100,0)))});
+            } else {
+                if ( modelType == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) continue; // TODO -- GENERAL_PLOIDY DOESN'T WORK
+
+                // test all combinations of PLs for 1 sample
+                for ( final List> PLsPerSample : Utils.makePermutations(bigDiploidPLs, 1, true) ) {
+                    tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)});
+                }
+
+                // small PL triples: all three entries equal nonRefPL except index 0 or index 1, which is set to 0
+                final List> smallDiploidPLs = new LinkedList>();
+                for ( final int nonRefPL : Arrays.asList(5, 10, 20, 30) ) {
+                    for ( int i = 0; i < 2; i++ ) {
+                        List pls = new ArrayList(Collections.nCopies(3, nonRefPL));
+                        pls.set(i, 0);
+                        smallDiploidPLs.add(pls);
+                    }
+                }
+                // groups of 5 samples drawn from the small triples; NOTE(review): confirm what the final `false` flag selects
+                for ( final List> PLsPerSample : Utils.makePermutations(smallDiploidPLs, 5, false) ) {
+                    tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)});
+                }
+            }
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    final List> removeBadPLs(List> listOfPLs) { // keeps only the PL lists whose values never increase from one element to the next
+        List> clean = new LinkedList>();
+
+        for ( final List PLs : listOfPLs ) {
+            int x = PLs.get(0); // running "previous value", seeded with the first element
+            boolean bad = false;
+            for ( int pl1 : PLs )
+                if ( pl1 > x ) // an increase relative to the previous value marks the list as bad
+                    bad = true;
+                else
+                    x = pl1;
+            if ( ! bad ) clean.add(PLs); // the input list object itself is added, not a copy
+        }
+
+        return clean;
+    }
+
+    private List toGenotypes(final List> PLsPerSample) { // turns each PL list into a no-call genotype carrying those PLs
+        final List nocall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
+        final List genotypes = new ArrayList(PLsPerSample.size());
+
+        for ( final List PLs : PLsPerSample ) {
+            final int[] pls = ArrayUtils.toPrimitive(PLs.toArray(new Integer[3]));
+            final int min = MathUtils.arrayMin(pls);
+            for ( int i = 0; i < pls.length; i++ ) pls[i] -= min; // shift so that the smallest PL becomes 0
+            genotypes.add(makePL(nocall, pls));
+        }
+
+        return genotypes;
+    }
+
+    @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRefBiallelicSystematic")
+    private void PNonRefBiallelicSystematic(AFCalcFactory.Calculation modelType, final List genotypes) { // compares modelType with EXACT_REFERENCE on the same genotypes; NOTE(review): private @Test -- confirm it is picked up
+        //logger.warn("Running " + modelType + " with " + genotypes);
+        final AFCalcTestBuilder refBuilder = new AFCalcTestBuilder(genotypes.size(), 1, AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcTestBuilder.PriorType.human);
+        final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(genotypes.size(), 1, modelType, AFCalcTestBuilder.PriorType.human);
+
+        final VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A, C));
+        vcb.genotypes(genotypes);
+
+        final AFCalcResult refResult = refBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors()); // priors come from testBuilder for both calls
+        final AFCalcResult testResult = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());
+
+        final double tolerance = 1e-3; // applies to the posterior only; the MLE counts below are compared without a tolerance
+        Assert.assertEquals(testResult.getLog10PosteriorOfAFGT0(), refResult.getLog10PosteriorOfAFGT0(), tolerance,
+                "Actual pNonRef not within tolerance " + tolerance + " of expected");
+        Assert.assertEquals(testResult.getAlleleCountsOfMLE(), refResult.getAlleleCountsOfMLE(),
+                "Actual MLE " + Utils.join(",", testResult.getAlleleCountsOfMLE()) + " not equal to expected " + Utils.join(",", refResult.getAlleleCountsOfMLE()));
+    }
+
+    // --------------------------------------------------------------------------------
+    //
+    // Test priors
+    //
+    // --------------------------------------------------------------------------------
+
+    @DataProvider(name = "Models")
+    public Object[][] makeModels() { // one row per Calculation value that reports itself usable for the parameters (2, 4)
+        List tests = new ArrayList();
+
+        for ( final AFCalcFactory.Calculation calc : AFCalcFactory.Calculation.values() ) {
+            if ( calc.usableForParams(2, 4) ) // the same (2, 4) pair is passed to createAFCalc below
+                tests.add(new Object[]{AFCalcFactory.createAFCalc(calc, 2, 4)});
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    @Test(enabled = true && !DEBUG_ONLY, dataProvider = "Models")
+    public void testBiallelicPriors(final AFCalc model) { // sweeps the non-ref prior and checks posterior, polymorphism call and MLE AC for one sample
+        // REF_PL takes the values 10 and 20
+        for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) {
+            final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
+
+            for ( int log10NonRefPrior = 1; log10NonRefPrior < 10*REF_PL; log10NonRefPrior += 1 ) { // value is fed to QualityUtils.qualToErrorProb to obtain the non-ref prior mass
+                final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
+                final double nonRefPrior = (1-refPrior) / 2; // remaining mass split evenly over the two non-ref entries
+                final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, nonRefPrior, nonRefPrior}), true);
+                if ( ! Double.isInfinite(priors[1]) ) { // skip priors whose non-ref entry is infinite
+                    GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior);
+                    final AFCalcResult resultTracker = cfg.execute();
+                    final int actualAC = resultTracker.getAlleleCountsOfMLE()[0];
+
+                    final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
+                    final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5); // NOTE(review): the -log10(0.5) term doubles the het term in linear space -- confirm the intended derivation
+                    final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior)); // hand-computed expectation from the ref and het terms only
+                    final double log10NonRefPost = Math.log10(nonRefPost);
+
+                    if ( ! Double.isInfinite(log10NonRefPost) )
+                        Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2);
+
+                    if ( nonRefPost >= 0.9 ) // only confident expectations are checked against isPolymorphic
+                        Assert.assertTrue(resultTracker.isPolymorphic(C, -1));
+
+                    final int expectedMLEAC = 1; // the MLE is independent of the prior
+                    Assert.assertEquals(actualAC, expectedMLEAC,
+                            "actual AC with priors " + log10NonRefPrior + " not expected "
+                                    + expectedMLEAC + " priors " + Utils.join(",", priors));
+                }
+            }
+        }
+    }
+
+    // --------------------------------------------------------------------------------
+    //
+    // Test that polymorphic sites (bi and tri) are properly called
+    //
+    // --------------------------------------------------------------------------------
+
+    /** Supplies {AFCalcTestBuilder, ACs, nonTypePLs, expected isPolymorphic flags} rows for testCallingGeneral:
+     *  a bi-allelic row for every combination, plus two-alt rows for the 10- and 100-sample cases. Provider only, not a test. */
+    @DataProvider(name = "polyTestProvider")
+    public Object[][] makePolyTestProvider() {
+        List tests = new ArrayList();
+
+        // list of all high-quality models in the system
+        final List models = Arrays.asList(
+                AFCalcFactory.Calculation.getDefaultModel(),
+                AFCalcFactory.Calculation.EXACT_REFERENCE,
+                AFCalcFactory.Calculation.EXACT_INDEPENDENT);
+
+        // note that we cannot use small PLs here or the thresholds are hard to set
+        for ( final int nonTypePLs : Arrays.asList(100, 1000) ) {
+            for ( final AFCalcFactory.Calculation model : models ) {
+                for ( final int allele1AC : Arrays.asList(0, 1, 2, 10, 100, 1000, 10000) ) {
+                    for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
+//        for ( final int nonTypePLs : Arrays.asList(10) ) {
+//            for ( final AFCalcFactory.Calculation model : models ) {
+//                for ( final int allele1AC : Arrays.asList(100) ) {
+//                    for ( final int nSamples : Arrays.asList(1000) ) {
+                        if ( nSamples < allele1AC ) continue; // skip combinations where the AC exceeds the sample count
+
+                        final double pPerSample = Math.pow(10, nonTypePLs / -10.0); // PL converted to a linear-scale probability
+                        final double errorFreq = pPerSample * nSamples;
+                        final boolean poly1 = allele1AC > errorFreq && (nonTypePLs * allele1AC) > 30; // heuristic expectation for the first alt allele
+
+                        // bi-allelic tests
+                        {
+                            final AFCalcTestBuilder testBuilder
+                                    = new AFCalcTestBuilder(nSamples, 1, model, AFCalcTestBuilder.PriorType.human);
+                            final List ACs = Arrays.asList(allele1AC);
+                            tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1)});
+                        }
+
+                        // multi-allelic tests
+                        for ( final int allele2AC : Arrays.asList(0, 1, 2, 10, 20, 50) ) {
+                            if ( nSamples < allele2AC || allele1AC + allele2AC > nSamples || nSamples > 100 || nSamples == 1) // two-alt rows only for 10 and 100 samples with a feasible AC sum
+                                continue;
+
+                            final AFCalcTestBuilder testBuilder
+                                    = new AFCalcTestBuilder(nSamples, 2, model, AFCalcTestBuilder.PriorType.human);
+                            final List ACs = Arrays.asList(allele1AC, allele2AC);
+                            final boolean poly2 = allele2AC > errorFreq && (nonTypePLs * allele2AC) > 90; // stricter product threshold (90) than for the first alt (30)
+                            tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1, poly2)});
+                        }
+                    }
+                }
+            }
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "polyTestProvider")
+    public void testCallingGeneral(final AFCalcTestBuilder testBuilder, final List ACs, final int nonTypePL, final List expectedPoly ) { // thin TestNG entry point for the "polyTestProvider" rows
+        testCalling(testBuilder, ACs, nonTypePL, expectedPoly); // all assertions live in testCalling
+    }
+
+    @DataProvider(name = "polyTestProviderLotsOfAlleles")
+    public Object[][] makepolyTestProviderLotsOfAlleles() { // rows: {testBuilder, ACs, nonTypePLs, isPoly} for sites built with nAlleles = 4
+        List tests = new ArrayList();
+
+        // list of all high-quality models in the system
+        final List models = Arrays.asList(AFCalcFactory.Calculation.EXACT_INDEPENDENT); // only EXACT_INDEPENDENT is listed here
+
+        final List alleleCounts = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 20);
+
+        final int nonTypePLs = 1000;
+        final int nAlleles = 4; // length of each AC combination and the allele-count argument given to the builder
+        for ( final AFCalcFactory.Calculation model : models ) {
+            for ( final List ACs : Utils.makePermutations(alleleCounts, nAlleles, true) ) {
+                final List isPoly = new ArrayList(ACs.size());
+                for ( final int ac : ACs ) isPoly.add(ac > 0); // an allele is expected polymorphic exactly when its AC is positive
+
+                final double acSum = MathUtils.sum(ACs);
+                for ( final int nSamples : Arrays.asList(1, 10, 100) ) {
+                    if ( nSamples < acSum ) continue; // skip sample counts smaller than the total AC
+                    final AFCalcTestBuilder testBuilder
+                            = new AFCalcTestBuilder(nSamples, nAlleles, model, AFCalcTestBuilder.PriorType.human);
+                    tests.add(new Object[]{testBuilder, ACs, nonTypePLs, isPoly});
+                }
+            }
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "polyTestProviderLotsOfAlleles")
+    public void testCallingLotsOfAlleles(final AFCalcTestBuilder testBuilder, final List ACs, final int nonTypePL, final List expectedPoly ) { // thin TestNG entry point for the "polyTestProviderLotsOfAlleles" rows
+        testCalling(testBuilder, ACs, nonTypePL, expectedPoly); // all assertions live in testCalling
+    }
+
+    private void testCalling(final AFCalcTestBuilder testBuilder, final List ACs, final int nonTypePL, final List expectedPoly) { // shared body of the two testCalling* tests
+        final AFCalc calc = testBuilder.makeModel();
+        final double[] priors = testBuilder.makePriors();
+        final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);
+        final AFCalcResult result = calc.getLog10PNonRef(vc, priors);
+
+        boolean anyPoly = false;
+        for ( final boolean onePoly : expectedPoly ) anyPoly = anyPoly || onePoly; // true when at least one allele is expected polymorphic
+
+        if ( anyPoly ) // site-level check: the log10 posterior of AF > 0 must exceed -1
+            Assert.assertTrue(result.getLog10PosteriorOfAFGT0() > -1);
+
+        for ( int altI = 1; altI < result.getAllelesUsedInGenotyping().size(); altI++ ) { // starts at 1, skipping the first allele in the result's list
+            final int i = altI - 1; // index into expectedPoly
+            final Allele alt = result.getAllelesUsedInGenotyping().get(altI);
+
+            // must be getCalledChrCount because we cannot ensure that the VC made has our desired ACs
+            Assert.assertEquals(result.getAlleleCountAtMLE(alt), vc.getCalledChrCount(alt));
+            Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFGt0ForAllele(alt));
+        }
+    }
+}
\ No newline at end of file
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java
similarity index 94%
rename from protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java
rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java
index 983f562d2..3df2f7883 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidyAFCalculationModelUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/GeneralPloidyAFCalculationModelUnitTest.java
@@ -1,6 +1,7 @@
-package org.broadinstitute.sting.gatk.walkers.genotyper;
+package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
 
 import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
 import org.broadinstitute.sting.utils.variantcontext.Allele;
 import org.broadinstitute.sting.utils.variantcontext.Genotype;
 import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
@@ -136,18 +137,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
 
     @Test(dataProvider = "getGLs")
     public void testGLs(GetGLsTest cfg) {
-
-        final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles);
         final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
         double[] priors = new double[len];  // flat priors
 
-        GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
+        final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, cfg.ploidy);
+        calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors);
         int nameIndex = 1;
         for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
             int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
-            int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
-
-//            System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount);
+            int calculatedAlleleCount = calc.getStateTracker().getAlleleCountsOfMAP()[allele];
             Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
         }
     }
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java
new file mode 100644
index 000000000..391c99990
--- /dev/null
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalcUnitTest.java
@@ -0,0 +1,210 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
+
+import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
+import org.broadinstitute.sting.utils.variantcontext.Genotype;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
+import org.testng.Assert;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import java.util.*;
+
+
+// SEE  private/R/pls.R if you want the truth output for these tests
+/**
+ * Unit tests for the building blocks of {@link IndependentAllelesDiploidExactAFCalc}: combining
+ * multi-allelic PLs into bi-allelic PLs, splitting a variant context into one bi-allelic context
+ * per alternate allele, and applying the multi-allelic priors.
+ */
+public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest {
+    /**
+     * Cases for {@link #testCombineGLs}: each row is
+     * (altIndex, nAlts, genotype holding the input PLs, genotype holding the expected combined PLs).
+     */
+    @DataProvider(name = "TestCombineGLs")
+    public Object[][] makeTestCombineGLs() {
+        final List<Object[]> tests = new ArrayList<Object[]>();
+
+        tests.add(new Object[]{1, 1, makePL( 0, 10, 20), makePL( 0, 10, 20)});
+        tests.add(new Object[]{1, 1, makePL(10,  0, 20), makePL(10,  0, 20)});
+        tests.add(new Object[]{1, 1, makePL(20, 10,  0), makePL(20, 10,  0)});
+
+        // AA AB BB AC BC CC => AA AB+BC CC
+        tests.add(new Object[]{1, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 10, 20)});
+        tests.add(new Object[]{2, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 30, 50)});
+
+        tests.add(new Object[]{1, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)});
+        tests.add(new Object[]{2, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)});
+
+        tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5)});
+        tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9)});
+
+        tests.add(new Object[]{1, 2, makePL(  0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)});
+        tests.add(new Object[]{2, 2, makePL(  0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)});
+
+        tests.add(new Object[]{1, 2, makePL( 50,  0, 50, 50, 50, 50), makePL(45, 0, 50)});
+        tests.add(new Object[]{2, 2, makePL( 50,  0, 50, 50, 50, 50), makePL( 0, 47, 50)});
+
+        tests.add(new Object[]{1, 2, makePL( 50, 50, 0, 50, 50, 50), makePL(45, 47,  0)});
+        tests.add(new Object[]{2, 2, makePL( 50, 50, 0, 50, 50, 50), makePL( 0, 47, 50)});
+
+        tests.add(new Object[]{1, 2, makePL( 50, 50, 50,  0, 50, 50), makePL(0, 47, 50)});
+        tests.add(new Object[]{2, 2, makePL( 50, 50, 50,  0, 50, 50), makePL(45, 0, 50)});
+
+        tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)});
+        tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)});
+
+        tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 50,  0), makePL(0, 47, 50)});
+        tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 50,  0), makePL(45, 47, 0)});
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    /** Wraps the given PLs in a genotype with two no-call alleles by delegating to AFCalcUnitTest.makePL. */
+    private Genotype makePL(final int ... PLs) {
+        return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs);
+    }
+
+    /**
+     * Checks that combineGLs, called with altIndex and nAlts, turns the PLs of testg into the PLs
+     * of expected.
+     */
+    @Test(enabled = true, dataProvider = "TestCombineGLs")
+    public void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) {
+        final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
+        final Genotype combined = calc.combineGLs(testg, altIndex, nAlts);
+
+        Assert.assertEquals(combined.getPL(), expected.getPL(),
+                "Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL()));
+    }
+
+
+    // alleles shared by the hand-built variant contexts and AFCalcResults below
+    static final Allele A = Allele.create("A", true);
+    static final Allele C = Allele.create("C");
+    static final Allele G = Allele.create("G");
+
+    /**
+     * Cases for {@link #testMakeAlleleConditionalContexts}: each row is
+     * (input variant context, expected bi-allelic variant contexts in the order they must be returned).
+     */
+    @DataProvider(name = "TestMakeAlleleConditionalContexts")
+    public Object[][] makeTestMakeAlleleConditionalContexts() {
+        final List<Object[]> tests = new ArrayList<Object[]>();
+
+        final VariantContextBuilder root = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A));
+        final VariantContextBuilder vcAC = new VariantContextBuilder(root).alleles(Arrays.asList(A, C));
+        final VariantContextBuilder vcAG = new VariantContextBuilder(root).alleles(Arrays.asList(A, G));
+        final VariantContextBuilder vcACG = new VariantContextBuilder(root).alleles(Arrays.asList(A, C, G));
+        final VariantContextBuilder vcAGC = new VariantContextBuilder(root).alleles(Arrays.asList(A, G, C));
+
+        final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5);
+        final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2);
+        final Genotype gACcombined = makePL(0, 2, 5);
+        final Genotype gACcombined2 = makePL(0, 1, 4);
+        final Genotype gAGcombined = makePL(0, 4, 9);
+
+        // biallelic
+        tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())});
+
+        // tri-allelic
+        tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGcombined).make())});
+        tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACcombined2).make())});
+
+        return tests.toArray(new Object[][]{});
+    }
+
+
+    /**
+     * Checks that makeAlleleConditionalContexts returns as many contexts as expected and that each
+     * one matches its expectation in alleles and in the PLs of every genotype.
+     */
+    @Test(enabled = true, dataProvider = "TestMakeAlleleConditionalContexts")
+    public void testMakeAlleleConditionalContexts(final VariantContext vc, final List<VariantContext> expectedVCs) {
+        final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
+        final List<VariantContext> biAllelicVCs = calc.makeAlleleConditionalContexts(vc);
+
+        Assert.assertEquals(biAllelicVCs.size(), expectedVCs.size());
+
+        for ( int i = 0; i < biAllelicVCs.size(); i++ ) {
+            final VariantContext actual = biAllelicVCs.get(i);
+            final VariantContext expected = expectedVCs.get(i);
+            Assert.assertEquals(actual.getAlleles(), expected.getAlleles());
+
+            for ( int j = 0; j < actual.getNSamples(); j++ )
+                Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL(),
+                        "expected PLs " + Utils.join(",", expected.getGenotype(j).getPL()) + " not equal to actual " + Utils.join(",", actual.getGenotype(j).getPL()));
+        }
+    }
+
+
+    /**
+     * Cases for {@link #testThetaNTests}: each row is (list of log10 likelihoods, pRef), with lists of
+     * size 1 to 4 produced by Utils.makePermutations and pRef taken from 0.1, 0.01 and 0.001.
+     */
+    @DataProvider(name = "ThetaNTests")
+    public Object[][] makeThetaNTests() {
+        final List<Object[]> tests = new ArrayList<Object[]>();
+
+        final List<Double> log10LAlleles = Arrays.asList(0.0, -1.0, -2.0, -3.0, -4.0);
+
+        for ( final double log10pRef : Arrays.asList(-1, -2, -3) ) {
+            for ( final int ploidy : Arrays.asList(1, 2, 3, 4) ) {
+                for ( List<Double> permutations : Utils.makePermutations(log10LAlleles, ploidy, true)) {
+                    tests.add(new Object[]{permutations, Math.pow(10, log10pRef)});
+                }
+            }
+        }
+
+        return tests.toArray(new Object[][]{});
+    }
+
+    /**
+     * Checks that applyMultiAllelicPriors gives the i-th returned result (0-based) a log10 prior of
+     * AF greater than 0 equal to (i + 1) * log10(1 - pRef), while the inputs keep log10(1 - pRef).
+     */
+    @Test(dataProvider = "ThetaNTests")
+    public void testThetaNTests(final List<Double> log10LAlleles, final double pRef) {
+        // biallelic
+        final double[] rawPriors = MathUtils.toLog10(new double[]{pRef, 1-pRef});
+
+        final double log10pNonRef = Math.log10(1-pRef);
+
+        final List<AFCalcResult> originalPriors = new LinkedList<AFCalcResult>();
+        final List<Double> pNonRefN = new LinkedList<Double>();
+        for ( int i = 0; i < log10LAlleles.size(); i++ ) {
+            final double log10LAllele1 = log10LAlleles.get(i);
+            final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true);
+            final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, 0.0));
+            originalPriors.add(result1);
+            pNonRefN.add(log10pNonRef*(i+1));
+        }
+
+        final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 2);
+        final List<AFCalcResult> thetaNPriors = calc.applyMultiAllelicPriors(originalPriors);
+
+        double prevPosterior = 0.0;
+        for ( int i = 0; i < log10LAlleles.size(); i++ ) {
+            final AFCalcResult thetaN = thetaNPriors.get(i);
+            AFCalcResult orig = null;
+            // NOTE(review): every input result is built with the same allele list (A, C), so if
+            // getAllelesUsedInGenotyping() returns that list this lookup always ends on the last
+            // input rather than the one matching thetaN -- confirm this is intended
+            for ( final AFCalcResult x : originalPriors )
+                if ( x.getAllelesUsedInGenotyping().equals(thetaN.getAllelesUsedInGenotyping()))
+                    orig = x;
+
+            Assert.assertNotNull(orig, "couldn't find original AFCalc");
+
+            Assert.assertEquals(orig.getLog10PriorOfAFGT0(), log10pNonRef, 1e-6);
+            Assert.assertEquals(thetaN.getLog10PriorOfAFGT0(), pNonRefN.get(i), 1e-6);
+
+            Assert.assertTrue(orig.getLog10PosteriorOfAFGT0() <= prevPosterior, "AFCalc results should be sorted but " + prevPosterior + " is > original posterior " + orig.getLog10PosteriorOfAFGT0());
+            prevPosterior = orig.getLog10PosteriorOfAFGT0();
+        }
+    }
+}
diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
index c766f363c..86f3748ce 100644
--- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
+++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java
@@ -8,9 +8,10 @@ import java.util.Arrays;
 public class HaplotypeCallerIntegrationTest extends WalkerTest {
     final static String REF = b37KGReference;
     final String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
+    final String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam";
     final String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
+    final String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam";
     final String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
-    //final String RECAL_FILE = validationDataLocation + "NA12878.kmer.8.subset.recal_data.bqsr";
 
     private void HCTest(String bam, String args, String md5) {
         final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
@@ -20,28 +21,77 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
 
     @Test
     public void testHaplotypeCallerMultiSample() {
-        HCTest(CEUTRIO_BAM, "", "6b30c7e1b6bbe80d180d9d67441cec12");
+        HCTest(CEUTRIO_BAM, "", "aa1df35d6e64d7ca93feb4d2dd15dd0e");
     }
 
     @Test
     public void testHaplotypeCallerSingleSample() {
-        HCTest(NA12878_BAM, "", "4cdfbfeadef00725974828310558d7d4");
+        HCTest(NA12878_BAM, "", "186c7f322978283c01249c6de2829215");
     }
 
     @Test
     public void testHaplotypeCallerMultiSampleGGA() {
-        HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "6183fb6e374976d7087150009685e043");
+        HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "de9e78a52207fe62144dba5337965469");
     }
 
     private void HCTestComplexVariants(String bam, String args, String md5) {
-        final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 3";
+        final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 2";
         final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
         executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
     }
 
     @Test
     public void testHaplotypeCallerMultiSampleComplex() {
-        HCTestComplexVariants(CEUTRIO_BAM, "", "ab7593a7a60a2e9a66053572f1718df1");
+        HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "000dbb1b48f94d017cfec127c6cabe8f");
+    }
+
+    private void HCTestSymbolicVariants(String bam, String args, String md5) { // HaplotypeCaller over two fixed -L intervals on contig 20
+        final String intervalsAndOptions = " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 2";
+        final String commandLine = "-T HaplotypeCaller -R " + REF + " -I " + bam + intervalsAndOptions + " " + args;
+        executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, new WalkerTestSpec(commandLine, Arrays.asList(md5)));
+    }
+
+    @Test
+    public void testHaplotypeCallerSingleSampleSymbolic() { // runs HCTestSymbolicVariants on NA12878_CHR20_BAM with no extra arguments
+        HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "16013a9203367c3d1c4ce1dcdc81ef4a");
+    }
+
+    private void HCTestIndelQualityScores(String bam, String args, String md5) { // HaplotypeCaller over a single fixed -L interval on contig 20
+        final String intervalAndOptions = " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
+        final String commandLine = "-T HaplotypeCaller -R " + REF + " -I " + bam + intervalAndOptions + " " + args;
+        executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, new WalkerTestSpec(commandLine, Arrays.asList(md5)));
+    }
+
+    @Test
+    public void testHaplotypeCallerSingleSampleIndelQualityScores() { // runs HCTestIndelQualityScores on NA12878_RECALIBRATED_BAM with no extra arguments
+        HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "b369c2a6cb5c99a424551b33bae16f3b");
+    }
+
+    @Test
+    public void HCTestProblematicReadsModifiedInActiveRegions() { // no -L argument is passed for this BAM
+        final String problemBam = privateTestDir + "haplotype-problem-4.bam";
+        final String commandLine = "-T HaplotypeCaller -R " + REF + " -I " + problemBam + " --no_cmdline_in_header -o %s -minPruning 3";
+        executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", new WalkerTestSpec(commandLine, Arrays.asList("c306140ad28515ee06c603c225217939")));
+    }
+
+    @Test
+    public void HCTestStructuralIndels() { // HaplotypeCaller with -minPruning 6 over two fixed -L intervals on contig 20
+        final String indelsBam = privateTestDir + "AFR.structural.indels.bam";
+        final String commandLine = "-T HaplotypeCaller -R " + REF + " -I " + indelsBam + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
+        executeTest("HCTestStructuralIndels: ", new WalkerTestSpec(commandLine, Arrays.asList("b6c67ee8e99cc8f53a6587bb26028047")));
+    }
+
+    // --------------------------------------------------------------------------------------------------------------
+    //
+    // testing reduced reads
+    //
+    // --------------------------------------------------------------------------------------------------------------
+
+    @Test
+    public void HCTestReducedBam() { // REF is the same b37KGReference string the command line is built from
+        final String commandLine = "-T HaplotypeCaller -R " + REF + " --no_cmdline_in_header -I " + privateTestDir
+                + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518";
+        final WalkerTestSpec reducedSpec = new WalkerTestSpec(commandLine, 1, Arrays.asList("4beb9f87ab3f316a9384c3d0dca6ebe9"));
+        executeTest("HC calling on a ReducedRead BAM", reducedSpec);
     }
 }
-
diff --git a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java
similarity index 56%
rename from public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java
rename to protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java
index 22bcb1bbf..6281054b1 100644
--- a/public/java/test/org/broadinstitute/sting/utils/PairHMMUnitTest.java
+++ b/protected/java/test/org/broadinstitute/sting/utils/pairhmm/PairHMMUnitTest.java
@@ -23,24 +23,26 @@
  */
 
 // our package
-package org.broadinstitute.sting.utils;
+package org.broadinstitute.sting.utils.pairhmm;
 
 
 // the imports for unit testing.
 
-
 import org.broadinstitute.sting.BaseTest;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.Utils;
 import org.testng.Assert;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
 
 import java.util.*;
 
-
 public class PairHMMUnitTest extends BaseTest {
     final static boolean EXTENSIVE_TESTING = true;
-    PairHMM hmm = new PairHMM( false ); // reference implementation
-    PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding
+    PairHMM exactHMM = new ExactPairHMM(); // the log truth implementation
+    PairHMM originalHMM = new OriginalPairHMM(); // the reference implementation
+    PairHMM cachingHMM = new CachingPairHMM();
+    PairHMM loglessHMM = new LoglessCachingPairHMM();
 
     // --------------------------------------------------------------------------------
     //
@@ -57,7 +59,7 @@ public class PairHMMUnitTest extends BaseTest {
         final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC";
         final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA";
 
-        public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) {
+        public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) {
             this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
         }
 
@@ -76,115 +78,51 @@ public class PairHMMUnitTest extends BaseTest {
         }
 
         public double expectedLogL() {
-            return expectedQual / -10.0;
+            return (expectedQual / -10.0) + 0.03 ;
         }
 
-        public double tolerance() {
-            return 0.1; // TODO FIXME arbitrary
+        public double toleranceFromTheoretical() {
+            return 0.2;
         }
 
-        public double calcLogL() {
+        public double toleranceFromReference() { // tolerance the tests use when comparing cachingHMM results against originalHMM
+            return 1E-4;
+        }
 
-            double logL = hmm.computeReadLikelihoodGivenHaplotype(
+        public double toleranceFromExact() { // tolerance the tests use when comparing loglessHMM results against exactHMM
+            return 1E-9;
+        }
+
+        public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) {
+            pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length);
+            return pairHMM.computeReadLikelihoodGivenHaplotypeLog10(
                     refBasesWithContext, readBasesWithContext,
-                    qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true),
-                    qualAsBytes(gcp, false));
-
-            return logL;
+                    qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel),
+                    qualAsBytes(gcp, false, anchorIndel), 0, true);
         }
 
         private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
             return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
         }
 
-        private byte[] qualAsBytes(final int phredQual, final boolean doGOP) {
+        private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) {
             final byte phredQuals[] = new byte[readBasesWithContext.length];
-            // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
-            Arrays.fill(phredQuals, (byte)100);
 
-            // update just the bases corresponding to the provided micro read with the quality scores
-            if( doGOP ) {
-                phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
-            } else {
-                for ( int i = 0; i < read.length(); i++)
-                    phredQuals[i + CONTEXT.length()] = (byte)phredQual;
-            }
+            if( anchorIndel ) {
+                // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
+                Arrays.fill(phredQuals, (byte)100);
 
-            return phredQuals;
-        }
-    }
-
-    final Random random = new Random(87865573);
-    private class BandedLikelihoodTestProvider extends TestDataProvider {
-        final String ref, read;
-        final byte[] refBasesWithContext, readBasesWithContext;
-        final int baseQual, insQual, delQual, gcp;
-        final int expectedQual;
-        final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC";
-        final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA";
-        final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT";
-        final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA";
-        final byte[] baseQuals, insQuals, delQuals, gcps;
-
-        public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) {
-            this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
-        }
-
-        public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) {
-            super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual));
-            this.baseQual = baseQual;
-            this.delQual = delQual;
-            this.insQual = insQual;
-            this.gcp = gcp;
-            this.read = read;
-            this.ref = ref;
-            this.expectedQual = expectedQual;
-
-            refBasesWithContext = asBytes(ref, left, right);
-            readBasesWithContext = asBytes(read, false, false);
-            baseQuals = qualAsBytes(baseQual);
-            insQuals = qualAsBytes(insQual);
-            delQuals = qualAsBytes(delQual);
-            gcps = qualAsBytes(gcp, false);
-        }
-
-        public double expectedLogL() {
-            double logL = hmm.computeReadLikelihoodGivenHaplotype(
-                    refBasesWithContext, readBasesWithContext,
-                    baseQuals, insQuals, delQuals, gcps);
-
-            return logL;
-        }
-
-        public double tolerance() {
-            return 0.2; // TODO FIXME arbitrary
-        }
-
-        public double calcLogL() {
-
-            double logL = bandedHMM.computeReadLikelihoodGivenHaplotype(
-                    refBasesWithContext, readBasesWithContext,
-                    baseQuals, insQuals, delQuals, gcps);
-
-            return logL;
-        }
-
-        private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
-            return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
-        }
-
-        private byte[] qualAsBytes(final int phredQual) {
-            return qualAsBytes(phredQual, true);
-        }
-
-        private byte[] qualAsBytes(final int phredQual, final boolean addRandom) {
-            final byte phredQuals[] = new byte[readBasesWithContext.length];
-            Arrays.fill(phredQuals, (byte)phredQual);
-            if(addRandom) {
-                for( int iii = 0; iii < phredQuals.length; iii++) {
-                    phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3));
+                // update just the bases corresponding to the provided micro read with the quality scores
+                if( doGOP ) {
+                    phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
+                } else {
+                    for ( int i = 0; i < read.length(); i++)
+                        phredQuals[i + CONTEXT.length()] = (byte)phredQual;
                 }
+            } else {
+                Arrays.fill(phredQuals, (byte)phredQual);
             }
+
             return phredQuals;
         }
     }
@@ -195,8 +133,8 @@ public class PairHMMUnitTest extends BaseTest {
         // test all combinations
         final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30);
         final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40);
-        final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
-        final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2);
+        final List gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10);
+        final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2);
 
         for ( final int baseQual : baseQuals ) {
             for ( final int indelQual : indelQuals ) {
@@ -219,7 +157,7 @@ public class PairHMMUnitTest extends BaseTest {
 
                             for ( boolean insertionP : Arrays.asList(true, false)) {
                                 final String small = Utils.dupString((char)base, 1);
-                                final String big = Utils.dupString((char)base, size);
+                                final String big = Utils.dupString((char) base, size);
 
                                 final String ref = insertionP ? small : big;
                                 final String read = insertionP ? big : small;
@@ -238,69 +176,65 @@ public class PairHMMUnitTest extends BaseTest {
         return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
     }
 
-    @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
-    public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) {
-        double calculatedLogL = cfg.calcLogL();
-        double expectedLogL = cfg.expectedLogL();
-        logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString()));
-        Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance());
-    }
-
-    @DataProvider(name = "BandedLikelihoodTestProvider")
-    public Object[][] makeBandedLikelihoodTests() {
+    final Random random = new Random(87860573);
+    @DataProvider(name = "OptimizedLikelihoodTestProvider")
+    public Object[][] makeOptimizedLikelihoodTests() {
         // context on either side is ACGTTGCA REF ACGTTGCA
         // test all combinations
-        final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30);
-        final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40);
-        final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10);
-        final List sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2);
+        final List baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30);
+        final List indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40);
+        final List gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
+        final List sizes = EXTENSIVE_TESTING ? Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2);
 
         for ( final int baseQual : baseQuals ) {
             for ( final int indelQual : indelQuals ) {
                 for ( final int gcp : gcps ) {
-
-                    // test substitutions
-                    for ( final byte refBase : BaseUtils.BASES ) {
-                        for ( final byte readBase : BaseUtils.BASES ) {
-                            final String ref  = new String(new byte[]{refBase});
-                            final String read = new String(new byte[]{readBase});
-                            final int expected = refBase == readBase ? 0 : baseQual;
-                            new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
-                        }
-                    }
-
-                    // test insertions and deletions
-                    for ( final int size : sizes ) {
-                        for ( final byte base : BaseUtils.BASES ) {
-                            final int expected = indelQual + (size - 2) * gcp;
-
-                            for ( boolean insertionP : Arrays.asList(true, false)) {
-                                final String small = Utils.dupString((char)base, 1);
-                                final String big = Utils.dupString((char)base, size);
-
-                                final String ref = insertionP ? small : big;
-                                final String read = insertionP ? big : small;
-
-                                new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
-                                new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false);
-                                new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true);
-                                new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true);
+                    for ( final int refSize : sizes ) {
+                        for ( final int readSize : sizes ) {
+                            String ref = "";
+                            String read = "";
+                            for( int iii = 0; iii < refSize; iii++) {
+                                ref += (char) BaseUtils.BASES[random.nextInt(4)];
                             }
+                            for( int iii = 0; iii < readSize; iii++) {
+                                read += (char) BaseUtils.BASES[random.nextInt(4)];
+                            }
+                            new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp);
+                            new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, false);
+                            new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, false, true);
+                            new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, true);
                         }
                     }
                 }
             }
         }
 
-        return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class);
+        return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
     }
 
-    @Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true)
-    public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) {
-        double calculatedLogL = cfg.calcLogL();
+    @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
+    public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) {
+        double exactLogL = cfg.calcLogL( exactHMM, true );
+        double calculatedLogL = cfg.calcLogL( originalHMM, true );
+        double optimizedLogL = cfg.calcLogL( cachingHMM, true );
+        double loglessLogL = cfg.calcLogL( loglessHMM, true );
         double expectedLogL = cfg.expectedLogL();
-        logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString()));
-        Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance());
+        //logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString()));
+        Assert.assertEquals(exactLogL, expectedLogL, cfg.toleranceFromTheoretical());
+        Assert.assertEquals(calculatedLogL, expectedLogL, cfg.toleranceFromTheoretical());
+        Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference());
+        Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact());
+    }
+
+    @Test(dataProvider = "OptimizedLikelihoodTestProvider", enabled = true)
+    public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) {
+        final double logTruth = cfg.calcLogL( exactHMM, false );
+        final double referenceLogL = cfg.calcLogL( originalHMM, false );
+        final double cachingLogL = cfg.calcLogL( cachingHMM, false );
+        final double loglessCachingLogL = cfg.calcLogL( loglessHMM, false );
+        // no theoretical expectation is checked here: the implementations are only compared against each other
+        Assert.assertEquals(cachingLogL, referenceLogL, cfg.toleranceFromReference());
+        Assert.assertEquals(loglessCachingLogL, logTruth, cfg.toleranceFromExact());
     }
 
     @Test
@@ -322,11 +256,11 @@ public class PairHMMUnitTest extends BaseTest {
             byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
             // change single base at position k to C. If it's a C, change to T
             mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
-            double res1 = hmm.computeReadLikelihoodGivenHaplotype(
+            originalHMM.initialize(mread.length, haplotype1.length);
+            double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
                     haplotype1, mread,
                     quals, gop, gop,
-                    gcp);
-
+                    gcp, 0, false);
 
             System.out.format("H:%s\nR:  %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
 
@@ -353,11 +287,11 @@ public class PairHMMUnitTest extends BaseTest {
             byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
             // change single base at position k to C. If it's a C, change to T
             mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
-            double res1 = hmm.computeReadLikelihoodGivenHaplotype(
+            originalHMM.initialize(mread.length, haplotype1.length);
+            double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
                     haplotype1, mread,
                     quals, gop, gop,
-                    gcp);
-
+                    gcp, 0, false);
 
             System.out.format("H:%s\nR:  %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
 
diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
index 4c228ccb4..eba94c0cb 100644
--- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
+++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R
@@ -111,7 +111,13 @@ gsa.read.gatkreportv1 <- function(lines) {
   headerRowCount = -1;
   
   finishTable <- function() {
-    .gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows[1:rowCount,], tableEnv);
+    if ( rowCount == 1 )
+      # Workaround: with only 1 row, R collapses the selection into an unstructured vector,
+      # so convert it back into a 1-row matrix
+      sub <- t(as.matrix(tableRows[1:rowCount,]))
+    else
+      sub <- tableRows[1:rowCount,]
+    .gsa.assignGATKTableToEnvironment(tableName, tableHeader, sub, tableEnv);
   }
   
   for (line in lines) {
diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R
index 45dacd835..748f00e28 100644
--- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R
+++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.variantqc.utils.R
@@ -1,5 +1,6 @@
 library(gplots)
 library(ggplot2)
+library(tools)
 
 # -------------------------------------------------------
 # Utilities for displaying multiple plots per page
@@ -59,6 +60,7 @@ closePDF <- function(outputPDF) {
   if ( ! is.na(outputPDF) ) {
     dev.off()
     if (exists("compactPDF")) {
+      print("compacting PDF")
       compactPDF(outputPDF)
     }
   }
diff --git a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java
index 10326ef2e..507d4b786 100644
--- a/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java
+++ b/public/java/src/net/sf/picard/reference/FastaSequenceIndexBuilder.java
@@ -245,7 +245,7 @@ public class FastaSequenceIndexBuilder {
      * Reset iterators and add contig to sequence index
      */
     private void finishReadingContig(FastaSequenceIndex sequenceIndex) {
-        sequenceIndex.add(new FastaSequenceIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++));
+        sequenceIndex.add(new FastaSequenceIndexEntry(trimContigName(contig), location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++));
         status = Status.NONE;
         contig = "";
         size = 0;
@@ -258,6 +258,14 @@ public class FastaSequenceIndexBuilder {
         }
     }
 
+    /*
+     * Trims the contig name to the expected value by removing any characters after the first whitespace
+     * (any character matching the regex class \s, e.g. a space or a tab); names without whitespace are unchanged
+     */
+    private static String trimContigName(final String contigName) {
+        return contigName.split("\\s", 2)[0];
+    }
+
     /**
      * Stores FastaSequenceIndex as a .fasta.fai file on local machine
      * Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder
diff --git a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java
index ffc40067a..665b098e5 100644
--- a/public/java/src/net/sf/samtools/GATKBAMFileSpan.java
+++ b/public/java/src/net/sf/samtools/GATKBAMFileSpan.java
@@ -125,6 +125,37 @@ public class GATKBAMFileSpan extends BAMFileSpan {
         return size;
     }
 
+    /**
+     * Get a GATKChunk representing the "extent" of this file span, from the start of the first
+     * chunk to the end of the last chunk. The chunks list must be sorted in order to use this method.
+     *
+     * @return a GATKChunk representing the extent of this file span, or a GATKChunk representing
+     *         a span of size 0 if there are no chunks
+     */
+    public GATKChunk getExtent() {
+        validateSorted();   // TODO: defensive measure: may be unnecessary
+
+        List chunks = getChunks();
+        if ( chunks.isEmpty() ) {
+            return new GATKChunk(0L, 0L);
+        }
+
+        return new GATKChunk(chunks.get(0).getChunkStart(), chunks.get(chunks.size() - 1).getChunkEnd());
+    }
+
+    /**
+     * Validates the list of chunks to ensure that they appear in sorted order.
+     * @throws ReviewedStingException if a chunk starts before the end of the chunk preceding it in the list
+     */
+    private void validateSorted() {
+        List chunks = getChunks();
+        for ( int i = 1; i < chunks.size(); i++ ) {
+            if ( chunks.get(i).getChunkStart() < chunks.get(i-1).getChunkEnd() ) {
+                throw new ReviewedStingException(String.format("Chunk list is unsorted; chunk %s is before chunk %s", chunks.get(i), chunks.get(i-1)));
+            }
+        }
+    }
+
     /**
      * Computes the union of two FileSpans.
      * @param other FileSpan to union with this one.
diff --git a/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java
index e8eea5ff0..b903b9f7d 100644
--- a/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java
+++ b/public/java/src/org/broadinstitute/sting/alignment/AlignmentValidation.java
@@ -31,7 +31,7 @@ import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
 import org.broadinstitute.sting.commandline.Argument;
 import org.broadinstitute.sting.gatk.CommandLineGATK;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.walkers.ReadWalker;
 import org.broadinstitute.sting.utils.BaseUtils;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@@ -81,7 +81,7 @@ public class AlignmentValidation extends ReadWalker {
      * @return Number of reads aligned by this map (aka 1).
      */
     @Override
-    public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
+    public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
         //logger.info(String.format("examining read %s", read.getReadName()));
 
         byte[] bases = read.getReadBases();
diff --git a/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java b/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java
deleted file mode 100644
index 6206fc2ce..000000000
--- a/public/java/src/org/broadinstitute/sting/alignment/AlignmentWalker.java
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.alignment;
-
-import net.sf.picard.reference.ReferenceSequenceFileFactory;
-import net.sf.samtools.SAMFileHeader;
-import net.sf.samtools.SAMRecord;
-import net.sf.samtools.SAMSequenceDictionary;
-import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
-import org.broadinstitute.sting.alignment.bwa.BWTFiles;
-import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Output;
-import org.broadinstitute.sting.gatk.CommandLineGATK;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
-import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.ReadWalker;
-import org.broadinstitute.sting.gatk.walkers.WalkerName;
-import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-import java.io.File;
-
-/**
- * Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting alignments in SAM or BAM format.
- * Mimics the steps 'bwa aln' followed by 'bwa samse' using the BWA/C implementation.
- *
- * @author mhanna
- * @version 0.1
- */
-@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
-@WalkerName("Align")
-public class AlignmentWalker extends ReadWalker {
-    @Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned.  Alongside this reference should sit index files " +
-                                                                     "generated by bwa index -d bwtsw.  If unspecified, will default " +
-                                                                     "to the reference specified via the -R argument.",required=false)
-    private File targetReferenceFile = null;
-
-    @Output
-    private StingSAMFileWriter out = null;
-
-    /**
-     * The actual aligner.
-     */
-    private BWACAligner aligner = null;
-
-    /**
-     * New header to use, if desired.
-     */
-    private SAMFileHeader header;
-
-    /**
-     * Create an aligner object.  The aligner object will load and hold the BWT until close() is called.
-     */    
-    @Override
-    public void initialize() {
-        if(targetReferenceFile == null)
-            targetReferenceFile = getToolkit().getArguments().referenceFile;
-        BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
-        BWAConfiguration configuration = new BWAConfiguration();
-        aligner = new BWACAligner(bwtFiles,configuration);
-
-        // Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted.
-        header = getToolkit().getSAMFileHeader().clone();
-        SAMSequenceDictionary referenceDictionary =
-                ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
-        header.setSequenceDictionary(referenceDictionary);
-        header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
-
-        out.writeHeader(header);
-    }
-
-    /**
-     * Aligns a read to the given reference.
-     *
-     * @param ref Reference over the read.  Read will most likely be unmapped, so ref will be null.
-     * @param read Read to align.
-     * @return Number of alignments found for this read.
-     */
-    @Override
-    public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
-        SAMRecord alignedRead = aligner.align(read,header);
-        out.addAlignment(alignedRead);
-        return 1;
-    }
-
-    /**
-     * Initial value for reduce.  In this case, alignments will be counted.
-     * @return 0, indicating no alignments yet found.
-     */
-    @Override
-    public Integer reduceInit() { return 0; }
-
-    /**
-     * Calculates the number of alignments found.
-     * @param value Number of alignments found by this map.
-     * @param sum Number of alignments found before this map.
-     * @return Number of alignments found up to and including this map.
-     */    
-    @Override
-    public Integer reduce(Integer value, Integer sum) {
-        return value + sum;
-    }
-
-    /**
-     * Cleanup.
-     * @param result Number of reads processed.
-     */    
-    @Override
-    public void onTraversalDone(Integer result) {
-        aligner.close();
-        super.onTraversalDone(result);
-    }
-
-}
diff --git a/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignments.java b/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignments.java
deleted file mode 100644
index 336c95d42..000000000
--- a/public/java/src/org/broadinstitute/sting/alignment/CountBestAlignments.java
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2010 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
- * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.alignment;
-
-import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
-import org.broadinstitute.sting.alignment.bwa.BWTFiles;
-import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Output;
-import org.broadinstitute.sting.gatk.CommandLineGATK;
-import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
-import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
-import org.broadinstitute.sting.gatk.walkers.ReadWalker;
-import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
-import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
-
-import java.io.PrintStream;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.SortedMap;
-import java.util.TreeMap;
-
-/**
- * Counts the number of best alignments as presented by BWA and outputs a histogram of number of placements vs. the
- * frequency of that number of placements.
- *
- * @author mhanna
- * @version 0.1
- */
-@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
-public class CountBestAlignments extends ReadWalker {
-    /**
-     * The supporting BWT index generated using BWT.
-     */
-    @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
-    private String prefix = null;
-
-    @Output
-    private PrintStream out = null;    
-
-    /**
-     * The actual aligner.
-     */
-    private Aligner aligner = null;
-
-    private SortedMap alignmentFrequencies = new TreeMap();
-
-    /**
-     * Create an aligner object.  The aligner object will load and hold the BWT until close() is called.
-     */
-    @Override
-    public void initialize() {
-        if(prefix == null)
-            prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();        
-        BWTFiles bwtFiles = new BWTFiles(prefix);
-        BWAConfiguration configuration = new BWAConfiguration();
-        aligner = new BWACAligner(bwtFiles,configuration);
-    }
-
-    /**
-     * Aligns a read to the given reference.
-     *
-     * @param ref Reference over the read.  Read will most likely be unmapped, so ref will be null.
-     * @param read Read to align.
-     * @return Number of alignments found for this read.
-     */
-    @Override
-    public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
-        Iterator alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
-        if(alignmentIterator.hasNext()) {
-            int numAlignments = alignmentIterator.next().length;
-            if(alignmentFrequencies.containsKey(numAlignments))
-                alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1);
-            else
-                alignmentFrequencies.put(numAlignments,1);
-        }
-        return 1;
-    }    
-
-    /**
-     * Initial value for reduce.  In this case, validated reads will be counted.
-     * @return 0, indicating no reads yet validated.
-     */
-    @Override
-    public Integer reduceInit() { return 0; }
-
-    /**
-     * Calculates the number of reads processed.
-     * @param value Number of reads processed by this map.
-     * @param sum Number of reads processed before this map.
-     * @return Number of reads processed up to and including this map.
-     */
-    @Override
-    public Integer reduce(Integer value, Integer sum) {
-        return value + sum;
-    }
-
-    /**
-     * Cleanup.
-     * @param result Number of reads processed.
-     */
-    @Override
-    public void onTraversalDone(Integer result) {
-        aligner.close();
-        for(Map.Entry alignmentFrequency: alignmentFrequencies.entrySet())
-            out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue());
-        super.onTraversalDone(result);
-    }
-}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/Argument.java b/public/java/src/org/broadinstitute/sting/commandline/Argument.java
index 33592287d..67ce8a863 100755
--- a/public/java/src/org/broadinstitute/sting/commandline/Argument.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/Argument.java
@@ -62,7 +62,7 @@ public @interface Argument {
      * --help argument is specified. 
      * @return Doc string associated with this command-line argument.
      */
-    String doc();
+    String doc() default "Undocumented option";
 
     /**
      * Is this argument required.  If true, the command-line argument system will
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java
index c0823e5c5..6c8fb1f4d 100755
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatch.java
@@ -46,7 +46,7 @@ public class ArgumentMatch implements Iterable {
     /**
      * Maps indices of command line arguments to values paired with that argument.
      */
-    public final SortedMap> sites = new TreeMap>();
+    public final SortedMap> sites = new TreeMap>();
 
     /**
      * An ordered, freeform collection of tags.
@@ -90,11 +90,11 @@ public class ArgumentMatch implements Iterable {
      * @param value Value for the argument at this position.
      * @param tags ordered freeform text tags associated with this argument.
      */
-    private ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final String value, final Tags tags) {
+    private ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final ArgumentMatchValue value, final Tags tags) {
         this.label = label;
         this.definition = definition;
 
-        ArrayList values = new ArrayList();
+        ArrayList values = new ArrayList();
         if( value != null )
             values.add(value);
         sites.put(site,values );
@@ -131,11 +131,11 @@ public class ArgumentMatch implements Iterable {
      */
     @SuppressWarnings("unchecked")
     ArgumentMatch transform(Multiplexer multiplexer, Object key) {
-        SortedMap> newIndices = new TreeMap>();
-        for(Map.Entry> site: sites.entrySet()) {
-            List newEntries = new ArrayList();
-            for(String entry: site.getValue())
-                newEntries.add(multiplexer.transformArgument(key,entry));
+        SortedMap> newIndices = new TreeMap>();
+        for(Map.Entry> site: sites.entrySet()) {
+            List newEntries = new ArrayList();
+            for(ArgumentMatchValue entry: site.getValue())
+                newEntries.add(new ArgumentMatchStringValue(multiplexer.transformArgument(key,entry.asString())));
             newIndices.put(site.getKey(),newEntries);
         }
         ArgumentMatch newArgumentMatch = new ArgumentMatch(label,definition);
@@ -165,7 +165,7 @@ public class ArgumentMatch implements Iterable {
             /**
              * Iterate over each available token.
              */
-            private Iterator tokenIterator = null;
+            private Iterator tokenIterator = null;
 
             /**
              * The next site to return.  Null if none remain.
@@ -175,7 +175,7 @@ public class ArgumentMatch implements Iterable {
             /**
              * The next token to return.  Null if none remain.
              */
-            String nextToken = null;
+            ArgumentMatchValue nextToken = null;
 
             {
                 siteIterator = sites.keySet().iterator();
@@ -254,9 +254,9 @@ public class ArgumentMatch implements Iterable {
      * @param site site of the command-line argument to which this value is mated.
      * @param value Text representation of value to add.
      */
-    public void addValue( ArgumentMatchSite site, String value ) {
+    public void addValue( ArgumentMatchSite site, ArgumentMatchValue value ) {
         if( !sites.containsKey(site) || sites.get(site) == null )
-            sites.put(site, new ArrayList() );
+            sites.put(site, new ArrayList() );
         sites.get(site).add(value);
     }
 
@@ -275,8 +275,8 @@ public class ArgumentMatch implements Iterable {
      * Return the values associated with this argument match.
      * @return A collection of the string representation of these value.
      */
-    public List values() {
-        List values = new ArrayList();
+    public List values() {
+        List values = new ArrayList();
         for( ArgumentMatchSite site: sites.keySet() ) {
             if( sites.get(site) != null )
                 values.addAll(sites.get(site));
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchFileValue.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchFileValue.java
new file mode 100644
index 000000000..344b6829a
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchFileValue.java
@@ -0,0 +1,27 @@
+package org.broadinstitute.sting.commandline;
+
+import java.io.File;
+
+/**
+ * Holds a reference to a file as an argument match value.
+ *
+ * This is useful when the type of the stored file may be a subclass of java.io.File,
+ * for example a Queue RemoteFile.
+ */
+public class ArgumentMatchFileValue extends ArgumentMatchValue {
+    private final File file;
+
+    public ArgumentMatchFileValue(File file) {
+        this.file = file;
+    }
+
+    @Override
+    public String asString() {
+        return file == null ? null : file.getAbsolutePath();
+    }
+
+    @Override
+    public File asFile() {
+        return file;
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java
index ed2700006..9dfb3afbe 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSource.java
@@ -24,38 +24,36 @@
 
 package org.broadinstitute.sting.commandline;
 
-import java.io.File;
-
 /**
- * Where an argument match originated, via the commandline or a file.
+ * Where an argument match originated, via the commandline or a custom provider.
  */
 public class ArgumentMatchSource implements Comparable {
     public static final ArgumentMatchSource COMMAND_LINE = new ArgumentMatchSource(ArgumentMatchSourceType.CommandLine, null);
 
     private final ArgumentMatchSourceType type;
-    private final File file;
+    private final String description;
 
     /**
      * Creates an argument match source from the specified file.
-     * @param file File specifying the arguments. Must not be null.
+     * @param description Where the arguments originated.
      */
-    public ArgumentMatchSource(File file) {
-        this(ArgumentMatchSourceType.File, file);
+    public ArgumentMatchSource(String description) {
+        this(ArgumentMatchSourceType.Provider, description);
     }
 
-    private ArgumentMatchSource(ArgumentMatchSourceType type, File file) {
-        if (type == ArgumentMatchSourceType.File && file == null)
-            throw new IllegalArgumentException("An argument match source of type File cannot have a null file.");
+    private ArgumentMatchSource(ArgumentMatchSourceType type, String description) {
+        if (type == ArgumentMatchSourceType.Provider && description == null)
+            throw new IllegalArgumentException("An argument match source provider cannot have a null description.");
         this.type = type;
-        this.file = file;
+        this.description = description;
     }
 
     public ArgumentMatchSourceType getType() {
         return type;
     }
 
-    public File getFile() {
-        return file;
+    public String getDescription() {
+        return description;
     }
 
     @Override
@@ -65,13 +63,13 @@ public class ArgumentMatchSource implements Comparable {
 
         ArgumentMatchSource that = (ArgumentMatchSource) o;
 
-        return (type == that.type) && (file == null ? that.file == null : file.equals(that.file));
+        return (type == that.type) && (description == null ? that.description == null : description.equals(that.description));
     }
 
     @Override
     public int hashCode() {
         int result = type != null ? type.hashCode() : 0;
-        result = 31 * result + (file != null ? file.hashCode() : 0);
+        result = 31 * result + (description != null ? description.hashCode() : 0);
         return result;
     }
 
@@ -84,15 +82,15 @@ public class ArgumentMatchSource implements Comparable {
         if (comp != 0)
             return comp;
 
-        File f1 = this.file;
-        File f2 = that.file;
+        String d1 = this.description;
+        String d2 = that.description;
 
-        if ((f1 == null) ^ (f2 == null)) {
-            // If one of the files is null and the other is not
-            // put the null file first
-            return f1 == null ? -1 : 1;
+        if ((d1 == null) ^ (d2 == null)) {
+            // If one of the descriptions is null and the other is not
+            // put the null description first
+            return d1 == null ? -1 : 1;
         }
 
-        return f1 == null ? 0 : f1.compareTo(f2);
+        return d1 == null ? 0 : d1.compareTo(d2);
     }
 }
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java
index 3ff6e21d4..118316473 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchSourceType.java
@@ -25,8 +25,8 @@
 package org.broadinstitute.sting.commandline;
 
 /**
- * Type of where an argument match originated, via the commandline or a file.
+ * Type of where an argument match originated, via the commandline or some other provider.
  */
 public enum ArgumentMatchSourceType {
-    CommandLine, File
+    CommandLine, Provider
 }
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchStringValue.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchStringValue.java
new file mode 100644
index 000000000..bb2015c3b
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchStringValue.java
@@ -0,0 +1,24 @@
+package org.broadinstitute.sting.commandline;
+
+import java.io.File;
+
+/**
+ * Argument values that originated from a string.
+ */
+public class ArgumentMatchStringValue extends ArgumentMatchValue {
+    private final String value;
+
+    public ArgumentMatchStringValue(String value) {
+        this.value = value;
+    }
+
+    @Override
+    public String asString() {
+        return value;
+    }
+
+    @Override
+    public File asFile() {
+        return value == null ? null : new File(value);
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchValue.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchValue.java
new file mode 100644
index 000000000..bed4edfa6
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentMatchValue.java
@@ -0,0 +1,18 @@
+package org.broadinstitute.sting.commandline;
+
+import java.io.File;
+
+/**
+ * Returns argument values as either strings or files.
+ */
+public abstract class ArgumentMatchValue {
+    /**
+     * @return the value of this argument as a String object.
+     */
+    public abstract String asString();
+
+    /**
+     * @return the value of this argument as a File object.
+     */
+    public abstract File asFile();
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
index dd4a151bf..5d7eba16c 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
@@ -215,8 +215,8 @@ public abstract class ArgumentTypeDescriptor {
      * @param matches The matches for the given argument.
      * @return The value of the argument if available, or null if not present.
      */
-    protected String getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) {
-        Collection argumentValues = getArgumentValues( definition, matches );
+    protected ArgumentMatchValue getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) {
+        Collection argumentValues = getArgumentValues( definition, matches );
         if( argumentValues.size() > 1 )
             throw new UserException.CommandLineException("Multiple values associated with given definition, but this argument expects only one: " + definition.fullName);
         return argumentValues.size() > 0 ? argumentValues.iterator().next() : null;
@@ -244,8 +244,8 @@ public abstract class ArgumentTypeDescriptor {
      * @param matches The matches for the given argument.
      * @return The value of the argument if available, or an empty collection if not present.
      */
-    protected Collection getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) {
-        Collection values = new ArrayList();
+    protected Collection getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) {
+        Collection values = new ArrayList();
         for( ArgumentMatch match: matches ) {
             if( match.definition.equals(definition) )
                 values.addAll(match.values());
@@ -310,7 +310,7 @@ public abstract class ArgumentTypeDescriptor {
      */
     protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) {
         ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
-        String value = getArgumentValue(defaultDefinition, matches);
+        ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches);
         @SuppressWarnings("unchecked")
         Class parameterType = JVMUtils.getParameterizedTypeClass(type);
         String name = defaultDefinition.fullName;
@@ -328,7 +328,7 @@ public abstract class ArgumentTypeDescriptor {
      * @param fieldName The name of the field that was parsed. Used for error reporting.
      * @return The newly created binding object of type bindingClass.
      */
-    public static Object parseBinding(String value, Class parameterType, Type bindingClass,
+    public static Object parseBinding(ArgumentMatchValue value, Class parameterType, Type bindingClass,
                                       String bindingName, Tags tags, String fieldName) {
         try {
             String tribbleType = null;
@@ -337,7 +337,7 @@ public abstract class ArgumentTypeDescriptor {
                 throw new UserException.CommandLineException(
                         String.format("Unexpected number of positional tags for argument %s : %s. " +
                                 "Rod bindings only support -X:type and -X:name,type argument styles",
-                                value, fieldName));
+                                value.asString(), fieldName));
             } else if ( tags.getPositionalTags().size() == 2 ) {
                 // -X:name,type style
                 bindingName = tags.getPositionalTags().get(0);
@@ -366,7 +366,7 @@ public abstract class ArgumentTypeDescriptor {
 
                 if ( tribbleType == null ) {
                     // try to determine the file type dynamically
-                    File file = new File(value);
+                    File file = value.asFile();
                     if ( file.canRead() && file.isFile() ) {
                         FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file);
                         if ( featureDescriptor != null ) {
@@ -379,7 +379,7 @@ public abstract class ArgumentTypeDescriptor {
                         // IntervalBinding can be created from a normal String
                         Class rawType = (makeRawTypeIfNecessary(bindingClass));
                         try {
-                            return rawType.getConstructor(String.class).newInstance(value);
+                            return rawType.getConstructor(String.class).newInstance(value.asString());
                         } catch (NoSuchMethodException e) {
                             /* ignore */
                         }
@@ -399,7 +399,7 @@ public abstract class ArgumentTypeDescriptor {
             }
 
             Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class);
-            return ctor.newInstance(parameterType, bindingName, value, tribbleType, tags);
+            return ctor.newInstance(parameterType, bindingName, value.asString(), tribbleType, tags);
         } catch (Exception e) {
             if ( e instanceof UserException )
                 throw ((UserException)e);
@@ -517,7 +517,7 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
             return true;
 
         ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
-        String value = getArgumentValue( defaultDefinition, matches );
+        ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches);
         Object result;
         Tags tags = getArgumentTags(matches);
 
@@ -527,12 +527,12 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
                 Method valueOf = primitiveToWrapperMap.get(type).getMethod("valueOf",String.class);
                 if(value == null)
                     throw new MissingArgumentValueException(createDefaultArgumentDefinition(source));
-                result = valueOf.invoke(null,value.trim());
+                result = valueOf.invoke(null,value.asString().trim());
             } else if (type.isEnum()) {
                 Object[] vals = type.getEnumConstants();
                 Object defaultEnumeration = null;  // as we look at options, record the default option if it exists
                 for (Object val : vals) {
-                    if (String.valueOf(val).equalsIgnoreCase(value)) return val;
+                    if (String.valueOf(val).equalsIgnoreCase(value == null ? null : value.asString())) return val;
                     try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; }
                     catch (NoSuchFieldException e) { throw new ReviewedStingException("parsing " + type.toString() + "doesn't contain the field " + val.toString()); }
                 }
@@ -544,10 +544,12 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
                 else if (value == null)
                     throw new MissingArgumentValueException(createDefaultArgumentDefinition(source));
                 else
-                    throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value);
+                    throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString());
+            } else if (type.equals(File.class)) {
+                result = value == null ? null : value.asFile();
             } else {
                 Constructor ctor = type.getConstructor(String.class);
-                result = ctor.newInstance(value);
+                result = ctor.newInstance(value == null ? null : value.asString());
             }
         } catch (UserException e) {
             throw e;
diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
index 15ec9dfe5..d77ae67cf 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java
@@ -174,7 +174,7 @@ public abstract class CommandLineProgram {
             ParsingEngine parser = clp.parser = new ParsingEngine(clp);
             parser.addArgumentSource(clp.getClass());
 
-            Map> parsedArgs;
+            Map parsedArgs;
 
             // process the args
             if (clp.canAddArgumentsDynamically()) {
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsedArgs.java b/public/java/src/org/broadinstitute/sting/commandline/ParsedArgs.java
new file mode 100644
index 000000000..9ab315175
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/ParsedArgs.java
@@ -0,0 +1,13 @@
+package org.broadinstitute.sting.commandline;
+
+/**
+ * Represents a collection of parsed arguments for an argument source.
+ *
+ * Useful for printing out help documents.
+ */
+public abstract class ParsedArgs {
+    /**
+     * @return A compact description of the arguments from a provider/source.
+     */
+    public abstract String getDescription();
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsedListArgs.java b/public/java/src/org/broadinstitute/sting/commandline/ParsedListArgs.java
new file mode 100644
index 000000000..a77e73bcf
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/ParsedListArgs.java
@@ -0,0 +1,30 @@
+package org.broadinstitute.sting.commandline;
+
+import org.apache.commons.lang.StringUtils;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * A list of string arguments, usually from the command line or an args list file.
+ */
+public class ParsedListArgs extends ParsedArgs {
+    private final List args = new ArrayList();
+
+    public ParsedListArgs() {
+    }
+
+    public ParsedListArgs(List args) {
+        this.args.addAll(args);
+    }
+
+    public void add(String... args) {
+        this.args.addAll(Arrays.asList(args));
+    }
+
+    @Override
+    public String getDescription() {
+        return StringUtils.join(this.args, " ");
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java
index 0fac195e1..a8b729be4 100755
--- a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngine.java
@@ -30,6 +30,7 @@ import org.apache.commons.io.FileUtils;
 import org.apache.log4j.Logger;
 import org.broadinstitute.sting.utils.Utils;
 import org.broadinstitute.sting.utils.classloader.JVMUtils;
+import org.broadinstitute.sting.utils.classloader.PluginManager;
 import org.broadinstitute.sting.utils.collections.Pair;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
@@ -61,7 +62,7 @@ public class ParsingEngine {
      * Indicates as best as possible where command-line text remains unmatched
      * to existing arguments.
      */
-    ArgumentMatches argumentMatches = null;
+    private ArgumentMatches argumentMatches = null;
 
     /**
      * Techniques for parsing and for argument lookup.
@@ -88,7 +89,10 @@ public class ParsingEngine {
     /**
      * List of tags associated with the given instantiation of the command-line argument.
      */
-    private final Map tags = new IdentityHashMap();    
+    private final Map tags = new IdentityHashMap();
+
+    private PluginManager argumentProviderPluginManager =
+            new PluginManager(ParsingEngineArgumentProvider.class);
 
     /**
      * our log, which we want to capture anything from org.broadinstitute.sting
@@ -105,7 +109,10 @@ public class ParsingEngine {
             argumentTypeDescriptors.addAll(clp.getArgumentTypeDescriptors());
         argumentTypeDescriptors.addAll(STANDARD_ARGUMENT_TYPE_DESCRIPTORS);
 
-        addArgumentSource(ParsingEngineArgumentFiles.class);
+        List> providers = argumentProviderPluginManager.getPlugins();
+        for (Class provider: providers) {
+            addArgumentSource(provider);
+        }
     }
 
     /**
@@ -117,6 +124,10 @@ public class ParsingEngine {
         addArgumentSource(null, source);
     }
 
+    public ArgumentMatches getArgumentMatches() {
+        return argumentMatches;
+    }
+
     /**
      * Add an argument source.  Argument sources are expected to have
      * any number of fields with an @Argument annotation attached.
@@ -156,29 +167,30 @@ public class ParsingEngine {
      * @param tokens Tokens passed on the command line.
      * @return The parsed arguments by file.
      */
-    public SortedMap> parse( String[] tokens ) {
+    public SortedMap parse( String[] tokens ) {
         argumentMatches = new ArgumentMatches();
-        SortedMap> parsedArgs = new TreeMap>();
+        SortedMap parsedArgs = new TreeMap();
 
         List cmdLineTokens = Arrays.asList(tokens);
         parse(ArgumentMatchSource.COMMAND_LINE, cmdLineTokens, argumentMatches, parsedArgs);
 
-        ParsingEngineArgumentFiles argumentFiles = new ParsingEngineArgumentFiles();
+        List providers = argumentProviderPluginManager.createAllTypes();
 
-        // Load the arguments ONLY into the argument files.
-        // Validation may optionally run on the rest of the arguments.
-        loadArgumentsIntoObject(argumentFiles);
+        for (ParsingEngineArgumentProvider provider: providers) {
+            // Load the arguments ONLY into the provider.
+            // Validation may optionally run on the rest of the arguments.
+            loadArgumentsIntoObject(provider);
+        }
 
-        for (File file: argumentFiles.files) {
-            List fileTokens = getArguments(file);
-            parse(new ArgumentMatchSource(file), fileTokens, argumentMatches, parsedArgs);
+        for (ParsingEngineArgumentProvider provider: providers) {
+            provider.parse(this, parsedArgs);
         }
 
         return parsedArgs;
     }
 
-    private void parse(ArgumentMatchSource matchSource, List tokens,
-                       ArgumentMatches argumentMatches, SortedMap> parsedArgs) {
+    public void parse(ArgumentMatchSource matchSource, List tokens,
+                         ArgumentMatches argumentMatches, SortedMap parsedArgs) {
         ArgumentMatchSite lastArgumentMatchSite = new ArgumentMatchSite(matchSource, -1);
 
         int i = 0;
@@ -195,19 +207,44 @@ public class ParsingEngine {
             }
             else {
                 if( argumentMatches.hasMatch(lastArgumentMatchSite) &&
-                    !argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite))
-                    argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, token );
+                        !argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite))
+                    argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, new ArgumentMatchStringValue(token) );
                 else
-                    argumentMatches.MissingArgument.addValue( site, token );
+                    argumentMatches.MissingArgument.addValue( site, new ArgumentMatchStringValue(token) );
 
             }
             i++;
         }
 
-        parsedArgs.put(matchSource, tokens);
+        parsedArgs.put(matchSource, new ParsedListArgs(tokens));
     }
 
-    private List getArguments(File file) {
+    public void parsePairs(ArgumentMatchSource matchSource, List> tokens,
+                         ArgumentMatches argumentMatches, ParsedArgs matchSourceArgs,
+                         SortedMap parsedArgs) {
+        int i = 0;
+        for (Pair pair: tokens) {
+
+            ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i);
+            List matchers = Arrays.asList(ArgumentDefinitions.FullNameDefinitionMatcher, ArgumentDefinitions.ShortNameDefinitionMatcher);
+            ArgumentDefinition definition = null;
+            for (DefinitionMatcher matcher: matchers) {
+                definition = argumentDefinitions.findArgumentDefinition( pair.getFirst(), matcher );
+                if (definition != null)
+                    break;
+            }
+            if (definition == null)
+                continue;
+            ArgumentMatch argumentMatch = new ArgumentMatch(pair.getFirst(), definition, site, new Tags());
+            argumentMatches.mergeInto(argumentMatch);
+            argumentMatch.addValue(site, pair.getSecond());
+            i++;
+        }
+
+        parsedArgs.put(matchSource, matchSourceArgs);
+    }
+
+    protected List getArguments(File file) {
         try {
             if (file.getAbsolutePath().endsWith(".list")) {
                 return getListArguments(file);
@@ -283,9 +320,9 @@ public class ParsingEngine {
 
                 // Ensure that the field contents meet the validation criteria specified by the regular expression.
                 for( ArgumentMatch verifiableMatch: verifiableMatches ) {
-                    for( String value: verifiableMatch.values() ) {
-                        if( verifiableArgument.validation != null && !value.matches(verifiableArgument.validation) )
-                            invalidValues.add( new Pair(verifiableArgument, value) );
+                    for( ArgumentMatchValue value: verifiableMatch.values() ) {
+                        if( verifiableArgument.validation != null && !value.asString().matches(verifiableArgument.validation) )
+                            invalidValues.add( new Pair(verifiableArgument, value.asString()) );
                     }
                 }
             }
@@ -629,21 +666,21 @@ class UnmatchedArgumentException extends ArgumentException {
     private static String formatArguments( ArgumentMatch invalidValues ) {
         StringBuilder sb = new StringBuilder();
         for( ArgumentMatchSite site: invalidValues.sites.keySet() )
-            for( String value: invalidValues.sites.get(site) ) {
+            for( ArgumentMatchValue value: invalidValues.sites.get(site) ) {
                 switch (site.getSource().getType()) {
                     case CommandLine:
                         sb.append( String.format("%nInvalid argument value '%s' at position %d.",
-                                value, site.getIndex()) );
+                                value.asString(), site.getIndex()) );
                         break;
-                    case File:
-                        sb.append( String.format("%nInvalid argument value '%s' in file %s at position %d.",
-                                value, site.getSource().getFile().getAbsolutePath(), site.getIndex()) );
+                    case Provider:
+                        sb.append( String.format("%nInvalid argument value '%s' in %s at position %d.",
+                                value.asString(), site.getSource().getDescription(), site.getIndex()) );
                         break;
                     default:
                         throw new RuntimeException( String.format("Unexpected argument match source type: %s",
                                 site.getSource().getType()));
                 }
-                if(value != null && Utils.dupString(' ',value.length()).equals(value))
+                if(value.asString() != null && Utils.dupString(' ',value.asString().length()).equals(value.asString()))
                     sb.append("  Please make sure any line continuation backslashes on your command line are not followed by whitespace.");
             }
         return sb.toString();
@@ -696,12 +733,3 @@ class UnknownEnumeratedValueException extends ArgumentException {
         return String.format("Invalid value %s specified for argument %s; valid options are (%s).", argumentPassed, definition.fullName, Utils.join(",",definition.validOptions));
     }
 }
-
-/**
- * Container class to store the list of argument files.
- * The files will be parsed after the command line arguments.
- */
-class ParsingEngineArgumentFiles {
-    @Argument(fullName = "arg_file", shortName = "args", doc = "Reads arguments from the specified file", required = false)
-    public List files = new ArrayList();
-}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentFiles.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentFiles.java
new file mode 100644
index 000000000..3f3921937
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentFiles.java
@@ -0,0 +1,30 @@
+package org.broadinstitute.sting.commandline;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.SortedMap;
+
+/**
+ * Container class to store the list of argument files.
+ * The files will be parsed after the command line arguments.
+ */
+public class ParsingEngineArgumentFiles extends ParsingEngineArgumentProvider {
+    @Argument(fullName = "arg_file", shortName = "args", doc = "Reads arguments from the specified file", required = false)
+    public List files = new ArrayList();
+
+    @Override
+    public void parse(ParsingEngine parsingEngine, SortedMap parsedArgs) {
+        ArgumentMatches argumentMatches = parsingEngine.getArgumentMatches();
+        for (File file: this.files) {
+            List fileTokens = parsingEngine.getArguments(file);
+            parsingEngine.parse(new ArgumentMatchFileSource(file), fileTokens, argumentMatches, parsedArgs);
+        }
+    }
+}
+
+class ArgumentMatchFileSource extends ArgumentMatchSource {
+    ArgumentMatchFileSource(File file) {
+        super("file " + file.getAbsolutePath());
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentProvider.java b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentProvider.java
new file mode 100644
index 000000000..a57f8b08a
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/commandline/ParsingEngineArgumentProvider.java
@@ -0,0 +1,12 @@
+package org.broadinstitute.sting.commandline;
+
+import java.util.List;
+import java.util.SortedMap;
+
+/**
+ * A class that can parse arguments for the engine
+ */
+public abstract class ParsingEngineArgumentProvider {
+    public abstract void parse(ParsingEngine parsingEngine, SortedMap parsedArgs);
+}
+
diff --git a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java
index e0b1154c4..15d134fa2 100644
--- a/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/RodBinding.java
@@ -117,6 +117,15 @@ public final class RodBinding {
         this.bound = true;
     }
 
+    /**
+     * For testing purposes only.  Creates a RodBinding sufficient for looking up associations to rawName
+     * @param type
+     * @param rawName
+     */
+    public RodBinding(Class type, final String rawName) {
+        this(type, rawName, "missing", type.getSimpleName(), new Tags());
+    }
+
     /**
      * Make an unbound RodBinding.  Only available for creating the globally unique UNBOUND object
      * @param type class this unbound RodBinding creates
diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
index 312d31727..0daad2c2b 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java
@@ -112,31 +112,38 @@ public class CommandLineGATK extends CommandLineExecutable {
         }
     }
 
-    protected static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
-    protected static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
+    public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
+    public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
+    public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device";
+    public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded";
+
     private static void checkForMaskedUserErrors(final Throwable t) {
         final String message = t.getMessage();
         if ( message == null )
             return;
 
         // we know what to do about the common "Too many open files" error
-        if ( message.indexOf("Too many open files") != -1 )
+        if ( message.contains("Too many open files") )
             exitSystemWithUserError(new UserException.TooManyOpenFiles());
 
         // malformed BAM looks like a SAM file
-        if ( message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_1) != -1 ||
-                message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_2) != -1 )
+        if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) ||
+                message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) )
             exitSystemWithSamError(t);
 
         // can't close tribble index when writing
-        if ( message.indexOf("Unable to close index for") != -1 )
+        if ( message.contains("Unable to close index for") )
             exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage()));
 
         // disk is full
-        if ( message.indexOf("No space left on device") != -1 )
-            exitSystemWithUserError(new UserException(t.getMessage()));
-        if ( t.getCause() != null && t.getCause().getMessage().indexOf("No space left on device") != -1 )
-            exitSystemWithUserError(new UserException(t.getCause().getMessage()));
+        if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) )
+            exitSystemWithUserError(new UserException.NoSpaceOnDevice());
+        if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) )
+            exitSystemWithUserError(new UserException.NoSpaceOnDevice());
+
+        // masked out of memory error
+        if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError )
+            exitSystemWithUserError(new UserException.NotEnoughMemory());
     }
 
     /**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java
deleted file mode 100644
index 6d9e79156..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/DownsamplingMethod.java
+++ /dev/null
@@ -1,52 +0,0 @@
-package org.broadinstitute.sting.gatk;
-
-import org.broadinstitute.sting.utils.exceptions.UserException;
-
-/**
- * Describes the method for downsampling reads at a given locus.
- *
- * @author hanna
- * @version 0.1
- */
-
-public class DownsamplingMethod {
-    /**
-     * Type of downsampling to perform.
-     */
-    public final DownsampleType type;
-
-    /**
-     * Actual downsampling target is specified as an integer number of reads.
-     */
-    public final Integer toCoverage;
-
-    /**
-     * Actual downsampling target is specified as a fraction of total available reads.
-     */
-    public final Double toFraction;
-
-    /**
-     * Expresses no downsampling applied at all.
-     */
-    public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null);
-
-    public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) {
-        // Do some basic sanity checks on the downsampling parameters passed in.
-
-        // Can't leave toFraction and toCoverage null unless type is experimental naive duplicate eliminator.
-        if(type != DownsampleType.NONE && toFraction == null && toCoverage == null)
-            throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");
-
-        // Fraction and coverage cannot both be specified.
-        if(toFraction != null && toCoverage != null)
-            throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified.  Please choose only one.");
-
-        // Experimental by sample downsampling does not work with a fraction of reads.
-        if(type == DownsampleType.BY_SAMPLE && toFraction != null)
-            throw new UserException.CommandLineException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method");
-
-        this.type = type;
-        this.toCoverage = toCoverage;
-        this.toFraction = toFraction;
-    }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
index e76cde43a..b7000e0ee 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java
@@ -24,25 +24,28 @@
 
 package org.broadinstitute.sting.gatk;
 
+import com.google.java.contract.Ensures;
 import net.sf.picard.reference.IndexedFastaSequenceFile;
 import net.sf.picard.reference.ReferenceSequenceFile;
 import net.sf.samtools.SAMFileHeader;
 import net.sf.samtools.SAMRecord;
 import net.sf.samtools.SAMSequenceDictionary;
 import org.apache.log4j.Logger;
-import org.broad.tribble.readers.PositionalBufferedStream;
 import org.broadinstitute.sting.commandline.*;
 import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
 import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
 import org.broadinstitute.sting.gatk.datasources.reads.*;
 import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
 import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
+import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
 import org.broadinstitute.sting.gatk.executive.MicroScheduler;
 import org.broadinstitute.sting.gatk.filters.FilterManager;
 import org.broadinstitute.sting.gatk.filters.ReadFilter;
 import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
 import org.broadinstitute.sting.gatk.io.OutputTracker;
 import org.broadinstitute.sting.gatk.io.stubs.Stub;
+import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
+import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
 import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
 import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
 import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
@@ -50,21 +53,18 @@ import org.broadinstitute.sting.gatk.samples.SampleDB;
 import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
 import org.broadinstitute.sting.gatk.walkers.*;
 import org.broadinstitute.sting.utils.*;
-import org.broadinstitute.sting.utils.baq.BAQ;
 import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
-import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
+import org.broadinstitute.sting.utils.classloader.PluginManager;
 import org.broadinstitute.sting.utils.collections.Pair;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.interval.IntervalUtils;
 import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
-import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
+import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
 
 import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
 import java.util.*;
+import java.util.concurrent.TimeUnit;
 
 /**
  * A GenomeAnalysisEngine that runs a specified walker.
@@ -74,6 +74,7 @@ public class GenomeAnalysisEngine {
      * our log, which we want to capture anything from this class
      */
     private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class);
+    public static final long NO_RUNTIME_LIMIT = -1;
 
     /**
      * The GATK command-line argument parsing code.
@@ -136,11 +137,18 @@ public class GenomeAnalysisEngine {
      */
     private Collection filters;
 
+    /**
+     * Collection of the read transformers applied to the reads
+     */
+    private List readTransformers;
+
     /**
      * Controls the allocation of threads between CPU vs IO.
      */
     private ThreadAllocation threadAllocation;
 
+    private ReadMetrics cumulativeMetrics = null;
+
     /**
      * A currently hacky unique name for this GATK instance
      */
@@ -175,6 +183,13 @@ public class GenomeAnalysisEngine {
      */
     private Collection referenceMetaDataFiles;
 
+    /**
+     * The threading efficiency monitor we use in the GATK to monitor our efficiency.
+     *
+     * May be null if one isn't active, or hasn't been initialized yet
+     */
+    private ThreadEfficiencyMonitor threadEfficiencyMonitor = null;
+
     /**
      * Set the reference metadata files to use for this traversal.
      * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse.
@@ -252,6 +267,7 @@ public class GenomeAnalysisEngine {
 
         // our microscheduler, which is in charge of running everything
         MicroScheduler microScheduler = createMicroscheduler();
+        threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor();
 
         // create temp directories as necessary
         initializeTempDirectory();
@@ -280,6 +296,8 @@ public class GenomeAnalysisEngine {
     static {
         deprecatedGATKWalkers.put("CountCovariates", "2.0");
         deprecatedGATKWalkers.put("TableRecalibration", "2.0");
+        deprecatedGATKWalkers.put("AlignmentWalker", "2.2");
+        deprecatedGATKWalkers.put("CountBestAlignments", "2.2");
     }
 
     /**
@@ -349,32 +367,59 @@ public class GenomeAnalysisEngine {
         return Collections.unmodifiableList(filters);
     }
 
+    /**
+     * Creates and initializes every ReadTransformer plugin type, keeping those that report
+     * enabled() as this engine's non-null list of read transformers (see getReadTransformers)
+     *
+     * @param walker the walker we need to apply read transformers to
+     */
+    public void initializeReadTransformers(final Walker walker) {
+        final List activeTransformers = new ArrayList();
+
+        final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class);
+        final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null;
+
+        final PluginManager pluginManager = new PluginManager(ReadTransformer.class);
+
+        for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) {
+            transformer.initialize(overrideTime, this, walker);
+            if ( transformer.enabled() )
+                activeTransformers.add(transformer);
+        }
+
+        setReadTransformers(activeTransformers);
+    }
+
+    public List getReadTransformers() {
+        return readTransformers;
+    }
+
+    private void setReadTransformers(final List readTransformers) {
+        if ( readTransformers == null )
+            throw new ReviewedStingException("read transformers cannot be null");
+        this.readTransformers = readTransformers;
+    }
+
     /**
      * Parse out the thread allocation from the given command-line argument.
      */
     private void determineThreadAllocation() {
-        Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
+        if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads);
+        if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread);
+        if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads);
 
-        // TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters.
-        Integer numCPUThreads = null;
-        if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null)
-            throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument.  Please specify only one or the other.");
-        else if(tags.containsKey("cpu"))
-            numCPUThreads = Integer.parseInt(tags.getValue("cpu"));
-        else if(argCollection.numberOfCPUThreads != null)
-            numCPUThreads = argCollection.numberOfCPUThreads;
-
-        Integer numIOThreads = null;
-        if(tags.containsKey("io") && argCollection.numberOfIOThreads != null)
-            throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument.  Please specify only one or the other.");
-        else if(tags.containsKey("io"))
-            numIOThreads = Integer.parseInt(tags.getValue("io"));
-        else if(argCollection.numberOfIOThreads != null)
-            numIOThreads = argCollection.numberOfIOThreads;
-
-        this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
+        this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads,
+                argCollection.numberOfCPUThreadsPerDataThread,
+                argCollection.numberOfIOThreads,
+                argCollection.monitorThreadEfficiency);
     }
 
+    public int getTotalNumberOfThreads() {
+        return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads();
+    }
+
+
+
     /**
      * Allow subclasses and others within this package direct access to the walker manager.
      * @return The walker manager used by this package.
@@ -400,23 +445,19 @@ public class GenomeAnalysisEngine {
 
     protected DownsamplingMethod getDownsamplingMethod() {
         GATKArgumentCollection argCollection = this.getArguments();
-        DownsamplingMethod method;
-        if(argCollection.getDownsamplingMethod() != null)
-            method = argCollection.getDownsamplingMethod();
-        else if(WalkerManager.getDownsamplingMethod(walker) != null)
-            method = WalkerManager.getDownsamplingMethod(walker);
-        else
-            method = GATKArgumentCollection.getDefaultDownsamplingMethod();
-        return method;
+        boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling;
+
+        DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod();
+        DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling);
+        DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling);
+
+        return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod);
     }
 
     protected void setDownsamplingMethod(DownsamplingMethod method) {
         argCollection.setDownsamplingMethod(method);
     }
 
-    public BAQ.QualityMode getWalkerBAQQualityMode()         { return WalkerManager.getBAQQualityMode(walker); }
-    public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); }    
-
     protected boolean includeReadsWithDeletionAtLoci() {
         return walker.includeReadsWithDeletionAtLoci();
     }
@@ -504,6 +545,7 @@ public class GenomeAnalysisEngine {
      */
     protected Iterable getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
         ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
+        DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null;
         ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
 
         // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
@@ -538,10 +580,15 @@ public class GenomeAnalysisEngine {
                         throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
                 }
 
+                // Use the experimental ReadShardBalancer if experimental downsampling is enabled
+                ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useExperimentalDownsampling ?
+                                                  new ExperimentalReadShardBalancer() :
+                                                  new ReadShardBalancer();
+
                 if(intervals == null)
-                    return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
+                    return readsDataSource.createShardIteratorOverAllReads(readShardBalancer);
                 else
-                    return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer());
+                    return readsDataSource.createShardIteratorOverIntervals(intervals, readShardBalancer);
             }
             else
                 throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
@@ -639,14 +686,14 @@ public class GenomeAnalysisEngine {
 
         // if include argument isn't given, create new set of all possible intervals
 
-        Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair(
+        final Pair includeExcludePair = IntervalUtils.parseIntervalBindingsPair(
                 this.referenceDataSource,
                 argCollection.intervals,
                 argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding,
                 argCollection.excludeIntervals);
 
-        GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst();
-        GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond();
+        final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst();
+        final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond();
 
         // if no exclude arguments, can return parseIntervalArguments directly
         if ( excludeSortedSet == null )
@@ -657,13 +704,15 @@ public class GenomeAnalysisEngine {
             intervals = includeSortedSet.subtractRegions(excludeSortedSet);
 
             // logging messages only printed when exclude (-XL) arguments are given
-            long toPruneSize = includeSortedSet.coveredSize();
-            long toExcludeSize = excludeSortedSet.coveredSize();
-            long intervalSize = intervals.coveredSize();
+            final long toPruneSize = includeSortedSet.coveredSize();
+            final long toExcludeSize = excludeSortedSet.coveredSize();
+            final long intervalSize = intervals.coveredSize();
             logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
             logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
                     toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
         }
+
+        logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize()));
     }
 
     /**
@@ -697,13 +746,12 @@ public class GenomeAnalysisEngine {
     protected void initializeDataSources() {
         logger.info("Strictness is " + argCollection.strictnessLevel);
 
-        // TODO -- REMOVE ME
-        BAQ.DEFAULT_GOP = argCollection.BAQGOP;
-
         validateSuppliedReference();
         setReferenceDataSource(argCollection.referenceFile);
 
         validateSuppliedReads();
+        initializeReadTransformers(walker);
+
         readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference());
 
         for (ReadFilter filter : filters)
@@ -784,14 +832,13 @@ public class GenomeAnalysisEngine {
      * @return A data source for the given set of reads.
      */
     private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
-        DownsamplingMethod method = getDownsamplingMethod();
+        DownsamplingMethod downsamplingMethod = getDownsamplingMethod();
 
         // Synchronize the method back into the collection so that it shows up when
         // interrogating for the downsample method during command line recreation.
-        setDownsamplingMethod(method);
+        setDownsamplingMethod(downsamplingMethod);
 
-        if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF)
-            throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested.");
+        logger.info(downsamplingMethod);
 
         if (argCollection.removeProgramRecords && argCollection.keepProgramRecords)
             throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options");
@@ -809,14 +856,11 @@ public class GenomeAnalysisEngine {
                 argCollection.useOriginalBaseQualities,
                 argCollection.strictnessLevel,
                 argCollection.readBufferSize,
-                method,
+                downsamplingMethod,
                 new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
                 filters,
+                readTransformers,
                 includeReadsWithDeletionAtLoci(),
-                getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
-                getWalkerBAQQualityMode(),
-                refReader,
-                getBaseRecalibration(),
                 argCollection.defaultBaseQualities,
                 removeProgramRecords);
     }
@@ -943,6 +987,22 @@ public class GenomeAnalysisEngine {
         return this.intervals;
     }
 
+    /**
+     * Get the list of regions of the genome being processed.  If the user
+     * requested specific intervals, return those, otherwise return regions
+     * corresponding to the entire genome.  Never returns null.
+     *
+     * @return a non-null set of intervals being processed
+     */
+    @Ensures("result != null")
+    public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() {
+        if ( getIntervals() == null )
+            // if we don't have any intervals defined, create intervals from the reference itself
+            return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary());
+        else
+            return getIntervals();
+    }
+
     /**
      * Gets the list of filters employed by this engine.
      * @return Collection of filters (actual instances) used by this engine.
@@ -1000,7 +1060,19 @@ public class GenomeAnalysisEngine {
      *         owned by the caller; the caller can do with the object what they wish.
      */
     public ReadMetrics getCumulativeMetrics() {
-        return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics();
+        // todo -- probably shouldn't be lazy
+        if ( cumulativeMetrics == null )
+            cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics();
+        return cumulativeMetrics;
+    }
+
+    /**
+     * Return the global ThreadEfficiencyMonitor, if there is one
+     *
+     * @return the monitor, or null if none is active
+     */
+    public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() {
+        return threadEfficiencyMonitor;
     }
 
     // -------------------------------------------------------------------------------------
@@ -1020,6 +1092,33 @@ public class GenomeAnalysisEngine {
     public String createApproximateCommandLineArgumentString(Object... argumentProviders) {
         return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders);
     }
-    
 
+    /**
+     * Does the current runtime in unit exceed the runtime limit, if one has been provided?
+     *
+     * @param runtime the runtime of this GATK instance, expressed in unit (must be >= 0)
+     * @param unit the time unit of runtime
+     * @return false if no limit was requested or if runtime <= the limit, true otherwise
+     */
+    public boolean exceedsRuntimeLimit(final long runtime, final TimeUnit unit) {
+        if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime);
+
+        if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT )
+            return false;
+        else {
+            final long actualRuntimeNano = TimeUnit.NANOSECONDS.convert(runtime, unit);
+            final long maxRuntimeNano = getRuntimeLimitInNanoseconds();
+            return actualRuntimeNano > maxRuntimeNano;
+        }
+    }
+
+    /**
+     * @return the runtime limit in nanoseconds, or -1 if no limit was specified
+     */
+    public long getRuntimeLimitInNanoseconds() {
+        if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT )
+            return -1;
+        else
+            return TimeUnit.NANOSECONDS.convert(getArguments().maxRuntime, getArguments().maxRuntimeUnits);
+    }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java
index ceaa30f01..bfea0b1e1 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/ReadMetrics.java
@@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk;
 import net.sf.picard.filter.SamRecordFilter;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 
-import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.TreeMap;
@@ -119,11 +118,18 @@ public class ReadMetrics implements Cloneable {
         return nRecords;
     }
 
+    /**
+     * Adds the given amount to the number of 'iterations' (one call of filter/map/reduce sequence) completed.
+     */
+    public void incrementNumIterations(final long by) {
+        nRecords += by;
+    }
+
     /**
      * Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed.
      */
     public void incrementNumIterations() {
-        nRecords++;
+        incrementNumIterations(1);
     }
 
     public long getNumReadsSeen() {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
index e02b9d5af..c37def397 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/ReadProperties.java
@@ -1,15 +1,15 @@
 package org.broadinstitute.sting.gatk;
 
-import net.sf.picard.reference.IndexedFastaSequenceFile;
 import net.sf.samtools.SAMFileHeader;
 import net.sf.samtools.SAMFileReader;
 import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
 import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
+import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
 import org.broadinstitute.sting.gatk.filters.ReadFilter;
-import org.broadinstitute.sting.utils.baq.BAQ;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
+import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
 
 import java.util.Collection;
+import java.util.List;
 /**
  * User: hanna
  * Date: May 14, 2009
@@ -30,16 +30,14 @@ import java.util.Collection;
 public class ReadProperties {
     private final Collection readers;
     private final SAMFileHeader header;
+    private final SAMFileHeader.SortOrder sortOrder;
     private final SAMFileReader.ValidationStringency validationStringency;
     private final DownsamplingMethod downsamplingMethod;
     private final ValidationExclusion exclusionList;
     private final Collection supplementalFilters;
+    private final List readTransformers;
     private final boolean includeReadsWithDeletionAtLoci;
     private final boolean useOriginalBaseQualities;
-    private final BAQ.CalculationMode cmode;
-    private final BAQ.QualityMode qmode;
-    private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired
-    private final BaseRecalibration bqsrApplier;
     private final byte defaultBaseQualities;
 
     /**
@@ -67,6 +65,14 @@ public class ReadProperties {
         return header;
     }
 
+    /**
+     * Gets the sort order of the reads
+     * @return the sort order of the reads
+     */
+    public SAMFileHeader.SortOrder getSortOrder() {
+        return sortOrder;
+    }
+
     /**
      * How strict should validation be?
      * @return Stringency of validation.
@@ -95,6 +101,11 @@ public class ReadProperties {
         return supplementalFilters;
     }
 
+
+    public List getReadTransformers() {
+        return readTransformers;
+    }
+
     /**
      * Return whether to use original base qualities.
      * @return Whether to use original base qualities.
@@ -103,16 +114,6 @@ public class ReadProperties {
         return useOriginalBaseQualities;
     }
 
-
-    public BAQ.QualityMode getBAQQualityMode() { return qmode; }
-    public BAQ.CalculationMode getBAQCalculationMode() { return cmode; }
-
-    public IndexedFastaSequenceFile getRefReader() {
-        return refReader;
-    }
-
-    public BaseRecalibration getBQSRApplier() { return bqsrApplier; }
-
     /**
      * @return Default base quality value to fill reads missing base quality information.
      */
@@ -134,36 +135,29 @@ public class ReadProperties {
      * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
      *         will explicitly list reads with deletion over the current reference base; otherwise, only observed
      *        bases will be seen in the pileups, and the deletions will be skipped silently.
-     * @param cmode How should we apply the BAQ calculation to the reads?
-     * @param qmode How should we apply the BAQ calculation to the reads?
-     * @param refReader if applyBAQ is true, must be a valid pointer to a indexed fasta file reads so we can get the ref bases for BAQ calculation
      * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
      */
     public ReadProperties( Collection samFiles,
            SAMFileHeader header,
+           SAMFileHeader.SortOrder sortOrder,
            boolean useOriginalBaseQualities,
            SAMFileReader.ValidationStringency strictness,
            DownsamplingMethod downsamplingMethod,
            ValidationExclusion exclusionList,
            Collection supplementalFilters,
+           List readTransformers,
            boolean includeReadsWithDeletionAtLoci,
-           BAQ.CalculationMode cmode,
-           BAQ.QualityMode qmode,           
-           IndexedFastaSequenceFile refReader,
-           BaseRecalibration bqsrApplier,
            byte defaultBaseQualities) {
         this.readers = samFiles;
         this.header = header;
+        this.sortOrder = sortOrder;
         this.validationStringency = strictness;
         this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod;
         this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList;
         this.supplementalFilters = supplementalFilters;
+        this.readTransformers = readTransformers;
         this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
         this.useOriginalBaseQualities = useOriginalBaseQualities;
-        this.cmode = cmode;
-        this.qmode = qmode;
-        this.refReader = refReader;
-        this.bqsrApplier = bqsrApplier;
         this.defaultBaseQualities = defaultBaseQualities;
     }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java
index 8843d4bfe..fbacbddc4 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/WalkerManager.java
@@ -27,15 +27,18 @@ package org.broadinstitute.sting.gatk;
 
 import org.broadinstitute.sting.commandline.Hidden;
 import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
+import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
+import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
 import org.broadinstitute.sting.gatk.filters.FilterManager;
 import org.broadinstitute.sting.gatk.filters.ReadFilter;
+import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
 import org.broadinstitute.sting.gatk.walkers.*;
-import org.broadinstitute.sting.utils.baq.BAQ;
 import org.broadinstitute.sting.utils.classloader.PluginManager;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet;
 import org.broadinstitute.sting.utils.text.TextFormattingUtils;
 
+import java.lang.annotation.Annotation;
 import java.util.*;
 
 /**
@@ -303,9 +306,10 @@ public class WalkerManager extends PluginManager {
      * downsampling method is specified on the command-line, the command-line version will
      * be used instead.
      * @param walkerClass The class of the walker to interrogate.
+     * @param useExperimentalDownsampling If true, use the experimental downsampling implementation
      * @return The downsampling method, as specified by the walker.  Null if none exists.
      */
-    public static DownsamplingMethod getDownsamplingMethod(Class walkerClass) {
+    public static DownsamplingMethod getDownsamplingMethod(Class walkerClass, boolean useExperimentalDownsampling) {
         DownsamplingMethod downsamplingMethod = null;
 
         if( walkerClass.isAnnotationPresent(Downsample.class) ) {
@@ -313,17 +317,17 @@ public class WalkerManager extends PluginManager {
             DownsampleType type = downsampleParameters.by();
             Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null;
             Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null;
-            downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction);
+            downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling);
         }
 
         return downsamplingMethod;
     }
 
-    public static BAQ.QualityMode getBAQQualityMode(Walker walker) {
-        return walker.getClass().getAnnotation(BAQMode.class).QualityMode();
+    public static  T getWalkerAnnotation(final Walker walker, final Class clazz) {
+        return walker.getClass().getAnnotation(clazz);
     }
 
-    public static BAQ.ApplicationTime getBAQApplicationTime(Walker walker) {
+    public static ReadTransformer.ApplicationTime getBAQApplicationTime(Walker walker) {
         return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime();
     }    
 
@@ -332,10 +336,11 @@ public class WalkerManager extends PluginManager {
      * downsampling method is specified on the command-line, the command-line version will
      * be used instead.
      * @param walker The walker to interrogate.
+     * @param useExperimentalDownsampling If true, use the experimental downsampling implementation
      * @return The downsampling method, as specified by the walker.  Null if none exists.
      */
-    public static DownsamplingMethod getDownsamplingMethod(Walker walker) {
-        return getDownsamplingMethod(walker.getClass());
+    public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) {
+        return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling);
     }
 
     /**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
index bbbd96cf1..e2b943582 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java
@@ -31,8 +31,9 @@ import org.broadinstitute.sting.commandline.Argument;
 import org.broadinstitute.sting.commandline.Hidden;
 import org.broadinstitute.sting.commandline.Input;
 import org.broadinstitute.sting.commandline.IntervalBinding;
-import org.broadinstitute.sting.gatk.DownsampleType;
-import org.broadinstitute.sting.gatk.DownsamplingMethod;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
+import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
 import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
 import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
 import org.broadinstitute.sting.utils.QualityUtils;
@@ -41,7 +42,10 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
 import org.broadinstitute.sting.utils.interval.IntervalSetRule;
 
 import java.io.File;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.concurrent.TimeUnit;
 
 /**
  * @author aaron
@@ -64,12 +68,35 @@ public class GATKArgumentCollection {
     @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
     public Integer readBufferSize = null;
 
+    // --------------------------------------------------------------------------------------------------------------
+    //
+    // GATKRunReport options
+    //
+    // --------------------------------------------------------------------------------------------------------------
+
     @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false)
     public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD;
 
     @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false)
     public File gatkKeyFile = null;
 
+    /**
+     * The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary String tag that can be
+     * used to group together runs during later analysis.  One use of this capability is to tag runs as GATK
+     * performance tests, so that the performance of the GATK over time can be assessed from the logs directly.
+     *
+     * Note that the tags do not conform to any ontology, so you are free to use any tags that you might find
+     * meaningful.
+     */
+    @Argument(fullName = "tag", shortName = "tag", doc="Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis", required = false)
+    public String tag = "NA";
+
+    // --------------------------------------------------------------------------------------------------------------
+    //
+    // General features
+    //
+    // --------------------------------------------------------------------------------------------------------------
+
     @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
     public List readFilters = new ArrayList();
 
@@ -115,15 +142,20 @@ public class GATKArgumentCollection {
     @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
     public boolean nonDeterministicRandomSeed = false;
 
-    /**
-     * The override mechanism in the GATK, by default, populates the command-line arguments, then
-     * the defaults from the walker annotations.  Unfortunately, walker annotations should be trumped
-     * by a user explicitly specifying command-line arguments.
-     * TODO: Change the GATK so that walker defaults are loaded first, then command-line arguments.
-     */
-    private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
-    private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000;
+    @Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.")
+    public boolean disableRandomization = false;
 
+    @Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure.  By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false)
+    public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT;
+
+    @Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="The TimeUnit for maxRuntime", required = false)
+    public TimeUnit maxRuntimeUnits = TimeUnit.MINUTES;
+
+    // --------------------------------------------------------------------------------------------------------------
+    //
+    // Downsampling Arguments
+    //
+    // --------------------------------------------------------------------------------------------------------------
     @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus.  Reads will be selected randomly to be removed from the pile based on the method described here", required = false)
     public DownsampleType downsamplingType = null;
 
@@ -133,17 +165,20 @@ public class GATKArgumentCollection {
     @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false)
     public Integer downsampleCoverage = null;
 
+    @Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false)
+    @Hidden
+    public boolean enableExperimentalDownsampling = false;
+
     /**
      * Gets the downsampling method explicitly specified by the user.  If the user didn't specify
      * a default downsampling mechanism, return the default.
      * @return The explicitly specified downsampling mechanism, or the default if none exists.
      */
     public DownsamplingMethod getDownsamplingMethod() {
-        if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null)
+        if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null )
             return null;
-        if(downsamplingType == null && downsampleCoverage != null)
-            return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,downsampleCoverage,null);
-        return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction);
+
+        return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling);
     }
 
     /**
@@ -153,9 +188,11 @@ public class GATKArgumentCollection {
     public void setDownsamplingMethod(DownsamplingMethod method) {
         if (method == null)
             throw new IllegalArgumentException("method is null");
+
         downsamplingType = method.type;
         downsampleCoverage = method.toCoverage;
         downsampleFraction = method.toFraction;
+        enableExperimentalDownsampling = method.useExperimentalDownsampling;
     }
 
     // --------------------------------------------------------------------------------------------------------------
@@ -174,17 +211,14 @@ public class GATKArgumentCollection {
     // performance log arguments
     //
     // --------------------------------------------------------------------------------------------------------------
-    @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
-    public File performanceLog = null;
 
     /**
-     * Gets the default downsampling method, returned if the user didn't specify any downsampling
-     * method.
-     * @return The default downsampling mechanism, or null if none exists.
+     * The file name for the GATK performance log output, or null if you don't want to generate the
+     * detailed performance logging table.  This table is suitable for importing into R or any
+     * other analysis software that can read tsv files
      */
-    public static DownsamplingMethod getDefaultDownsamplingMethod() {
-        return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null);
-    }
+    @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
+    public File performanceLog = null;
 
     @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
     public Boolean useOriginalBaseQualities = false;
@@ -256,20 +290,40 @@ public class GATKArgumentCollection {
     @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime.  For expert users only who know what they are doing.  We do not support usage of this argument.", required = false)
     public ValidationExclusion.TYPE unsafe;
 
-    /** How many threads should be allocated to this analysis. */
-    @Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
-    public Integer numberOfThreads = 1;
+    // --------------------------------------------------------------------------------------------------------------
+    //
+    // Multi-threading arguments
+    //
+    // --------------------------------------------------------------------------------------------------------------
 
     /**
-     * The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types.
-     * TODO: Kill this when I can do a tagged integer in Queue.
+     * How many data threads should be allocated to this analysis?  Each data thread contains N cpu threads,
+     * and the data threads act as completely data-parallel processing units, increasing the memory usage of
+     * GATK with the number (M) of data threads.  Data threads generally scale extremely effectively, up to 24 cores.
      */
-    @Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false)
-    @Hidden
-    public Integer numberOfCPUThreads = null;
+    @Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false)
+    public Integer numberOfDataThreads = 1;
+
+    /**
+     * How many CPU threads should be allocated per data thread?  Each CPU thread operates the map
+     * cycle independently, but may run into earlier scaling problems with IO than data threads.  Has
+     * the benefit of not requiring X times as much memory per thread as data threads do, but rather
+     * only a constant overhead.
+     */
+    @Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false)
+    public int numberOfCPUThreadsPerDataThread = 1;
+
     @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false)
     @Hidden
-    public Integer numberOfIOThreads = null;
+    public int numberOfIOThreads = 0;
+
+    /**
+     * Enable GATK to monitor its own threading efficiency, at an itsy-bitsy tiny
+     * cost (< 0.1%) in runtime because of turning on the JavaBean.  This is largely for
+     * debugging purposes.
+     */
+    @Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable GATK threading efficiency monitoring", required = false)
+    public Boolean monitorThreadEfficiency = false;
 
     @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
     public Integer numberOfBAMFileHandles = null;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
index f30fc0316..547f375bb 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/StandardCallerArgumentCollection.java
@@ -1,13 +1,14 @@
 package org.broadinstitute.sting.gatk.arguments;
 
-import org.broadinstitute.sting.commandline.Advanced;
-import org.broadinstitute.sting.commandline.Argument;
-import org.broadinstitute.sting.commandline.Input;
-import org.broadinstitute.sting.commandline.RodBinding;
+import org.broadinstitute.sting.commandline.*;
 import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
 import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
+import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 
+import java.io.File;
+import java.io.PrintStream;
+
 /**
  * Created with IntelliJ IDEA.
  * User: rpoplin
@@ -55,8 +56,51 @@ public class StandardCallerArgumentCollection {
      * then only this many alleles will be used.  Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
      * scales exponentially based on the number of alternate alleles.  Unless there is a good reason to change the default value, we highly recommend
      * that you not play around with this parameter.
+     *
+     * As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6.
      */
     @Advanced
     @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
-    public int MAX_ALTERNATE_ALLELES = 3;
+    public int MAX_ALTERNATE_ALLELES = 6;
+
+    /**
+     * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
+     */
+    @Advanced
+    @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
+    public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
+
+    /**
+     * If this fraction is greater than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
+     * Basically, it will ignore the contamination fraction of reads for each alternate allele.  So if the pileup contains N total bases, then we
+     * will try to remove (N * contamination fraction) bases for each alternate allele.
+     */
+    @Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
+    public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION;
+    public static final double DEFAULT_CONTAMINATION_FRACTION = 0.05;
+
+    @Hidden
+    @Argument(fullName = "logRemovedReadsFromContaminationFiltering", shortName="contaminationLog", required=false)
+    public PrintStream contaminationLog = null;
+
+    @Hidden
+    @Argument(shortName = "logExactCalls", doc="x", required=false)
+    public File exactCallsLog = null;
+
+    public StandardCallerArgumentCollection() { }
+
+    // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
+    public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) {
+        this.alleles = SCAC.alleles;
+        this.GenotypingMode = SCAC.GenotypingMode;
+        this.heterozygosity = SCAC.heterozygosity;
+        this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
+        this.OutputMode = SCAC.OutputMode;
+        this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
+        this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
+        this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
+        this.contaminationLog = SCAC.contaminationLog;
+        this.exactCallsLog = SCAC.exactCallsLog;
+        this.AFmodel = SCAC.AFmodel;
+    }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java
index 1290319e2..af330bba9 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java
@@ -177,7 +177,7 @@ public class ReferenceContext {
      * @return The base at the given locus from the reference.
      */
     public byte getBase() {
-        return getBases()[(int)(locus.getStart() - window.getStart())];
+        return getBases()[(locus.getStart() - window.getStart())];
     }
 
     /**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java
new file mode 100644
index 000000000..1e39d6836
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/IntervalOverlappingRODsFromStream.java
@@ -0,0 +1,143 @@
+package org.broadinstitute.sting.gatk.datasources.providers;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import net.sf.picard.util.PeekableIterator;
+import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl;
+import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
+import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
+import org.broadinstitute.sting.utils.GenomeLoc;
+
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.ListIterator;
+
+/**
+ * Key algorithmic helper for ReadBasedReferenceOrderedData
+ *
+ * Takes a single iterator of features, and provides a single capability that returns
+ * the list of RODs that overlap an interval.  Allows sequential getOverlapping calls
+ * from intervals provided that these intervals always have increasing getStart() values.
+ *
+ */
+class IntervalOverlappingRODsFromStream {
+    /**
+     * Only held for QC purposes.  NOTE(review): never assigned within this class, so the ordering check in getOverlapping() cannot fire unless a same-package caller sets it -- confirm intent
+     */
+    GenomeLoc lastQuery = null;
+
+    private final String name;
+    private final LinkedList<GATKFeature> currentFeatures = new LinkedList<GATKFeature>();
+    private final PeekableIterator<RODRecordList> futureFeatures;
+
+    /**
+     * Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and
+     * returns RODRecordLists having name
+     *
+     * @param name the name given to every RODRecordList returned by getOverlapping
+     * @param futureFeatures the stream of not-yet-read ROD records; must not be null
+     */
+    IntervalOverlappingRODsFromStream(final String name, final PeekableIterator<RODRecordList> futureFeatures) {
+        if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null");
+
+        this.name = name;
+        this.futureFeatures = futureFeatures;
+    }
+
+    /**
+     * Get the list of RODs overlapping loc from this stream of RODs.
+     *
+     * Sequential calls to this function must obey the rule that loc2.getStart >= loc1.getStart
+     *
+     * @param loc the interval to query
+     * @return a non-null RODRecordList containing the overlapping RODs, which may be empty
+     */
+    @Ensures({"overlaps(loc, result)",
+            "! futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)",
+            "result != null"})
+    public RODRecordList getOverlapping(final GenomeLoc loc) {
+        if ( lastQuery != null && loc.getStart() < lastQuery.getStart() )
+            throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery));
+
+        trimCurrentFeaturesToLoc(loc);
+        readOverlappingFutureFeatures(loc);
+        return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc);
+    }
+
+
+    /**
+     * For contract assurance.  Checks that every feature in bindings overlaps loc
+     *
+     * @param loc the query interval
+     * @param bindings the features to test against loc
+     * @return true if every feature in bindings overlaps loc, false otherwise
+     */
+    @Requires({"loc != null", "bindings != null"})
+    private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) {
+        for ( final GATKFeature feature : bindings )
+            if ( ! feature.getLocation().overlapsP(loc) )
+                return false;
+        return true;
+    }
+
+    /**
+     * Subset the features in all to those that overlap with loc
+     *
+     * The current features list contains everything read that cannot be thrown away yet, but not
+     * everything in there necessarily overlaps with loc.  Subset to just those that do overlap
+     *
+     * @param loc the location that features must overlap
+     * @param all the list of all features
+     * @return a subset of all that overlaps with loc
+     */
+    @Requires({"loc != null", "all != null"})
+    @Ensures("result.size() <= all.size()")
+    private Collection<GATKFeature> subsetToOverlapping(final GenomeLoc loc, final Collection<GATKFeature> all) {
+        final LinkedList<GATKFeature> overlapping = new LinkedList<GATKFeature>();
+        for ( final GATKFeature feature : all )
+            if ( feature.getLocation().overlapsP(loc) )
+                overlapping.add(feature);
+        return overlapping;
+    }
+
+    /**
+     * Update function.  Remove all elements of currentFeatures that end before loc
+     *
+     * @param loc the location to use
+     */
+    @Requires("loc != null")
+    @Ensures("currentFeatures.size() <= old(currentFeatures.size())")
+    private void trimCurrentFeaturesToLoc(final GenomeLoc loc) {
+        final ListIterator<GATKFeature> it = currentFeatures.listIterator();
+        while ( it.hasNext() ) {
+            final GATKFeature feature = it.next();
+            if ( feature.getLocation().isBefore(loc) )
+                it.remove();
+        }
+    }
+
+    /**
+     * Update function: Read all elements from futureFeatures that overlap with loc
+     *
+     * Stops at the first element that is past loc (left unread in the stream), or when the stream empties
+     *
+     * @param loc the interval whose overlapping features should be read into currentFeatures
+     */
+    @Requires("loc != null")
+    @Ensures("currentFeatures.size() >= old(currentFeatures.size())")
+    private void readOverlappingFutureFeatures(final GenomeLoc loc) {
+        while ( futureFeatures.hasNext() ) {
+            final GenomeLoc nextLoc = futureFeatures.peek().getLocation();
+            if ( nextLoc.isBefore(loc) ) {
+                futureFeatures.next(); // next rod element is before loc, throw it away and keep looking
+            } else if ( nextLoc.isPast(loc) ) {
+                break; // next element is past loc, stop looking but don't pop it
+            } else if ( nextLoc.overlapsP(loc) ) {
+                // add overlapping elements to our current features, removing from stream
+                for ( final GATKFeature feature : futureFeatures.next() ) {
+                    currentFeatures.add(feature);
+                }
+            }
+        }
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
index a3ce6dd27..cd3403f2f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusView.java
@@ -1,6 +1,6 @@
 package org.broadinstitute.sting.gatk.datasources.providers;
 
-import org.broadinstitute.sting.gatk.DownsampleType;
+import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
 import org.broadinstitute.sting.gatk.ReadProperties;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.iterators.LocusIterator;
@@ -135,8 +135,13 @@ public abstract class LocusView extends LocusIterator implements View {
 
         // Cache the current and apply filtering.
         AlignmentContext current = nextLocus;
-        if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null )
+
+        // The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling:
+        if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling &&
+            sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) {
+
             current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
+        }
 
         // Indicate that the next operation will need to advance.
         nextLocus = null;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java
index d065635c8..080ac6686 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ManagingReferenceOrderedView.java
@@ -58,7 +58,7 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView {
             // todo -- warning, I removed the reference to the name from states
             bindings.add( state.iterator.seekForward(loc) );
 
-        return new RefMetaDataTracker(bindings, referenceContext);
+        return new RefMetaDataTracker(bindings);
     }
 
     /**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java
index 01e24df67..40fe03f4a 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedView.java
@@ -23,40 +23,63 @@
 
 package org.broadinstitute.sting.gatk.datasources.providers;
 
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import net.sf.picard.util.PeekableIterator;
 import net.sf.samtools.SAMRecord;
-import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.datasources.reads.ReadShard;
 import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
-import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
-import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
 import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
 import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
 import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
 
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
-import java.util.TreeMap;
 
-/** a ROD view for reads. This provides the Read traversals a way of getting a ReadMetaDataTracker */
+/** a ROD view for reads. This provides the Read traversals a way of getting a RefMetaDataTracker */
 public class ReadBasedReferenceOrderedView implements View {
-    private final WindowedData window;
-
-    public ReadBasedReferenceOrderedView(ShardDataProvider provider) {
-        window = new WindowedData(provider);
-        provider.register(this);
-    }
+    // a list of the RMDDataState (location->iterators)
+    private final List states = new ArrayList(1);
+    private final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker();
 
     /**
-     * for testing only please
-     *
-     * @param data the window provider
+     * Used to get genome locs for reads
      */
-    ReadBasedReferenceOrderedView(WindowedData data) {
-        window = data;
+    private final GenomeLocParser genomeLocParser;
+
+    /**
+     * The total extent of all reads in this span.  We create iterators from our RODs
+     * from the start of this span, to the end.
+     */
+    private final GenomeLoc shardSpan;
+
+    public ReadBasedReferenceOrderedView(final ShardDataProvider provider) {
+        this.genomeLocParser = provider.getGenomeLocParser();
+        // conditional to optimize the case where we don't have any ROD data
+        this.shardSpan = provider.getReferenceOrderedData() != null ? ((ReadShard)provider.getShard()).getReadsSpan() : null;
+        provider.register(this);
+
+        if ( provider.getReferenceOrderedData() != null && ! shardSpan.isUnmapped() ) {
+            for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
+                states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan)));
+        }
     }
 
-    public ReadMetaDataTracker getReferenceOrderedDataForRead(SAMRecord read) {
-        return window.getTracker(read);
+
+    /**
+     * Testing constructor: builds one ROD state per (names.get(i), featureSources.get(i)) pair without a ShardDataProvider
+     */
+    protected ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser,
+                                            final GenomeLoc shardSpan,
+                                            final List<String> names,
+                                            final List<PeekableIterator<RODRecordList>> featureSources) {
+        this.genomeLocParser = genomeLocParser;
+        this.shardSpan = shardSpan;
+        for ( int i = 0; i < names.size(); i++ )
+            states.add(new RMDDataState(names.get(i), featureSources.get(i)));
     }
 
     public Collection> getConflictingViews() {
@@ -65,135 +88,72 @@ public class ReadBasedReferenceOrderedView implements View {
         return classes;
     }
 
-    public void close() {
-        if (window != null) window.close();
-    }
-}
-
-
-/** stores a window of data, dropping RODs if we've passed the new reads start point. */
-class WindowedData {
-    // the queue of possibly in-frame RODs; RODs are removed as soon as they are out of scope
-    private final TreeMap mapping = new TreeMap();
-
-    // our current location from the last read we processed
-    private GenomeLoc currentLoc;
-
-    // a list of the RMDDataState (location->iterators)
-    private List states;
-
-    // the provider; where we get all our information
-    private final ShardDataProvider provider;
-
     /**
-     * our log, which we want to capture anything from this class
-     */
-    private static Logger logger = Logger.getLogger(WindowedData.class);
-
-    /**
-     * create a WindowedData given a shard provider
-     *
-     * @param provider the ShardDataProvider
-     */
-    public WindowedData(ShardDataProvider provider) {
-        this.provider = provider;
-    }
-
-    /**
-     * load the states dynamically, since the only way to get a genome loc is from the read (the shard doesn't have one)
-     *
-     * @param provider the ShardDataProvider
-     * @param rec      the current read
-     */
-    private void getStates(ShardDataProvider provider, SAMRecord rec) {
-
-        int stop = Integer.MAX_VALUE;
-        // figure out the appropriate alignment stop
-        if (provider.hasReference()) {
-            stop = provider.getReference().getSequenceDictionary().getSequence(rec.getReferenceIndex()).getSequenceLength();
-        }
-
-        // calculate the range of positions we need to look at
-        GenomeLoc range = provider.getGenomeLocParser().createGenomeLoc(rec.getReferenceName(),
-                rec.getAlignmentStart(),
-                stop);
-        states = new ArrayList();
-        if (provider.getReferenceOrderedData() != null)
-            for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
-                states.add(new RMDDataState(dataSource, dataSource.seek(range)));
-    }
-
-    /**
-     * this function is for testing only
-     *
-     * @param states a  list of RMDDataState to initialize with
-     */
-    WindowedData(List states) {
-        this.states = states;
-        provider = null;
-    }
-
-    /**
-     * create a ReadMetaDataTracker given the current read
+     * create a RefMetaDataTracker given the current read
      *
      * @param rec the read
      *
-     * @return a ReadMetaDataTracker for the read, from which you can get ROD -> read alignments
+     * @return a RefMetaDataTracker for the read, from which you can get ROD -> read alignments
      */
-    public ReadMetaDataTracker getTracker(SAMRecord rec) {
-        updatePosition(rec);
-        return new ReadMetaDataTracker(provider.getGenomeLocParser(), rec, mapping);
+    @Requires("rec != null")
+    @Ensures("result != null")
+    public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) {
+        if ( rec.getReadUnmappedFlag() )
+            // empty RODs for unmapped reads
+            return new RefMetaDataTracker();
+        else
+            return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec));
     }
 
-    /**
-     * update the position we're storing
-     *
-     * @param rec the read to use for start and end
-     */
-    private void updatePosition(SAMRecord rec) {
-        if (states == null) getStates(this.provider, rec);
-        currentLoc = provider.getGenomeLocParser().createGenomeLoc(rec);
-
-        // flush the queue looking for records we've passed over
-        while (mapping.size() > 0 && mapping.firstKey() < currentLoc.getStart())
-            mapping.pollFirstEntry(); // toss away records that we've passed
-
-        // add new data to the queue
-        for (RMDDataState state : states) {
-            // move into position
-            while (state.iterator.hasNext() && state.iterator.peekNextLocation().isBefore(currentLoc))
-                state.iterator.next();
-            while (state.iterator.hasNext() && state.iterator.peekNextLocation().overlapsP(currentLoc)) {
-                RODRecordList list = state.iterator.next();
-                for (GATKFeature datum : list) {
-                    if (!mapping.containsKey(list.getLocation().getStart()))
-                        mapping.put(list.getLocation().getStart(), new RODMetaDataContainer());
-                    mapping.get(list.getLocation().getStart()).addEntry(datum);
-                }
-            }
+    @Requires({"interval != null", "shardSpan == null || shardSpan.isUnmapped() || shardSpan.containsP(interval)"})
+    @Ensures("result != null")
+    public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) {
+        if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers)
+            return EMPTY_TRACKER;
+        else {
+            final List bindings = new ArrayList(states.size());
+            for ( final RMDDataState state : states )
+                bindings.add(state.stream.getOverlapping(interval));
+            return new RefMetaDataTracker(bindings);
         }
     }
 
-    /** Closes the current view. */
+    /**
+     * Closes the current view.
+     */
     public void close() {
-        if (states == null) return;
-        for (RMDDataState state : states)
-            state.dataSource.close( state.iterator );
+        for (final RMDDataState state : states)
+            state.close();
 
         // Clear out the existing data so that post-close() accesses to this data will fail-fast.
-        states = null;
+        states.clear();
     }
 
+    /** Models the traversal state of a given ROD lane. */
+    private static class RMDDataState {
+        public final ReferenceOrderedDataSource dataSource;
+        public final IntervalOverlappingRODsFromStream stream;
+        private final LocationAwareSeekableRODIterator iterator;
 
-}
+        public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) {
+            this.dataSource = dataSource;
+            this.iterator = iterator;
+            this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator(iterator));
+        }
 
-/** Models the traversal state of a given ROD lane. */
-class RMDDataState {
-    public final ReferenceOrderedDataSource dataSource;
-    public final LocationAwareSeekableRODIterator iterator;
+        /**
+         * For testing: builds a state named {@code name} over {@code iterator} with no data source, so close() does nothing
+         */
+        public RMDDataState(final String name, final PeekableIterator iterator) {
+            this.dataSource = null;
+            this.iterator = null;
+            this.stream = new IntervalOverlappingRODsFromStream(name, iterator); // already peekable, no need to re-wrap
+        }
 
-    public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) {
-        this.dataSource = dataSource;
-        this.iterator = iterator;
+        public void close() {
+            if ( dataSource != null )
+                dataSource.close( iterator );
+        }
     }
 }
+
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java
index 3d62faf49..5cc8faa0e 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ReadReferenceView.java
@@ -59,16 +59,18 @@ public class ReadReferenceView extends ReferenceView {
         }
 
         public byte[] getBases() {
-//            System.out.printf("Getting bases for location %s%n", loc);
-//            throw new StingException("x");
             return getReferenceBases(loc);
         }
     }
 
-    public ReferenceContext getReferenceContext( SAMRecord read ) {
+    /**
+     * Return a reference context appropriate for the span of read
+     *
+     * @param read the mapped read whose genome location defines the context
+     * @return a ReferenceContext built from the read's GenomeLoc (passed for both location arguments)
+     */
+    public ReferenceContext getReferenceContext( final SAMRecord read ) {
         GenomeLoc loc = genomeLocParser.createGenomeLoc(read);
-//        byte[] bases = super.getReferenceBases(loc);
-//        return new ReferenceContext( loc, loc, bases );
         return new ReferenceContext( genomeLocParser, loc, loc, getReferenceBasesProvider(loc) );
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java
index 54f8b44ed..4be7c63c8 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/RodLocusView.java
@@ -101,7 +101,7 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
     public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) {
         // special case the interval again -- add it into the ROD
         if ( interval != null ) { allTracksHere.add(interval); }
-        return new RefMetaDataTracker(allTracksHere, referenceContext);
+        return new RefMetaDataTracker(allTracksHere);
     }
 
     public boolean hasNext() {
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java
index 803bd885b..4279381d7 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ShardDataProvider.java
@@ -94,6 +94,13 @@ public abstract class ShardDataProvider {
         return referenceOrderedData;        
     }
 
+    /**
+     * @return true if getReferenceOrderedData() is non-empty, i.e. reference ordered data will be provided by this shard
+     */
+    public boolean hasReferenceOrderedData() {
+        return ! getReferenceOrderedData().isEmpty();
+    }
+
     /**
      * Create a data provider for the shard given the reads and reference.
      * @param shard The chunk of data over which traversals happen.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
index ebfef5dc1..8ee7e0439 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/BAMScheduler.java
@@ -124,7 +124,24 @@ public class BAMScheduler implements Iterator {
      */
     private FilePointer generatePointerOverEntireFileset() {
         FilePointer filePointer = new FilePointer();
-        Map currentPosition = dataSource.getCurrentPosition();
+
+        // This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is
+        // the only FilePointer we will create. This allows us to have this FilePointer represent regions from
+        // multiple contigs
+        filePointer.setIsMonolithic(true);
+
+        Map currentPosition;
+
+        // Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling
+        // TODO: clean this up once the experimental downsampling engine fork collapses
+        if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useExperimentalDownsampling ) {
+            currentPosition = dataSource.getInitialReaderPositions();
+        }
+        else {
+            currentPosition = dataSource.getCurrentPosition();
+
+        }
+
         for(SAMReaderID reader: dataSource.getReaderIDs())
             filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart()));
         return filePointer;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java
new file mode 100644
index 000000000..0440c7eae
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancer.java
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.datasources.reads;
+
+import net.sf.picard.util.PeekableIterator;
+import net.sf.samtools.SAMRecord;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+
+import java.util.*;
+
+/**
+ * Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards.
+ *
+ * When processing FilePointers, our strategy is to aggregate all FilePointers for each contig
+ * together into one monolithic FilePointer, create one persistent set of read iterators over
+ * that monolithic FilePointer, and repeatedly use that persistent set of read iterators to
+ * fill read shards with reads.
+ *
+ * This strategy has several important advantages:
+ *
+ * 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole
+ *    contig will have regions that overlap with other FilePointers on the same contig, due
+ *    to the limited granularity of BAM index data. By creating only one FilePointer per contig,
+ *    we avoid having to track how much of each file region we've visited (as we did in the
+ *    former implementation), we avoid expensive non-sequential access patterns in the files,
+ *    and we avoid having to repeatedly re-create our iterator chain for every small region
+ *    of interest.
+ *
+ * 2. We avoid boundary issues with the engine-level downsampling. Since we create a single
+ *    persistent set of read iterators (which include the downsampling iterator(s)) per contig,
+ *    the downsampling process is never interrupted by FilePointer or Shard boundaries, and never
+ *    loses crucial state information while downsampling within a contig.
+ *
+ * TODO: There is also at least one important disadvantage:
+ *
+ * 1. We load more BAM index data into memory at once, and this work is done upfront before processing
+ *    the next contig, creating a delay before traversal of each contig. This delay may be
+ *    compensated for by the gains listed in #1 above, and we may be no worse off overall in
+ *    terms of total runtime, but we need to verify this empirically.
+ *
+ * @author David Roazen
+ */
+public class ExperimentalReadShardBalancer extends ShardBalancer {
+
+    private static final Logger logger = Logger.getLogger(ExperimentalReadShardBalancer.class);
+
+    /**
+     * Convert iterators of file pointers into balanced iterators of shards.
+     * @return An iterator over balanced shards.
+     */
+    public Iterator iterator() {
+        return new Iterator() {
+            /**
+             * The cached shard to be returned next.  Prefetched in the peekable iterator style.
+             */
+            private Shard nextShard = null;
+
+            /**
+             * The file pointer currently being processed.
+             */
+            private FilePointer currentContigFilePointer = null;
+
+            /**
+             * Iterator over the reads from the current contig's file pointer. The same iterator will be
+             * used to fill all shards associated with a given file pointer
+             */
+            private PeekableIterator currentContigReadsIterator = null;
+
+            /**
+             * How many FilePointers have we pulled from the filePointers iterator?
+             */
+            private int totalFilePointersConsumed = 0;
+
+            /**
+             * Have we encountered a monolithic FilePointer?
+             */
+            private boolean encounteredMonolithicFilePointer = false;
+
+
+            {
+                createNextContigFilePointer();
+                advance();
+            }
+
+            public boolean hasNext() {
+                return nextShard != null;
+            }
+
+            public Shard next() {
+                if ( ! hasNext() )
+                    throw new NoSuchElementException("No next read shard available");
+                Shard currentShard = nextShard;
+                advance();
+                return currentShard;
+            }
+
+            private void advance() {
+                nextShard = null;
+
+                // May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away
+                while ( nextShard == null && currentContigFilePointer != null ) {
+
+                    // If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one):
+                    if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) {
+
+                        // Close the old, exhausted chain of iterators to release resources
+                        currentContigReadsIterator.close();
+
+                        // Advance to the FilePointer for the next contig
+                        createNextContigFilePointer();
+
+                        // We'll need to create a fresh iterator for this file pointer when we create the first
+                        // shard for it below.
+                        currentContigReadsIterator = null;
+                    }
+
+                    // At this point our currentContigReadsIterator may be null or non-null depending on whether or not
+                    // this is our first shard for this file pointer.
+                    if ( currentContigFilePointer != null ) {
+                        Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped);
+
+                        // Create a new reads iterator only when we've just advanced to the file pointer for the next
+                        // contig. It's essential that the iterators persist across all shards that share the same contig
+                        // to allow the downsampling to work properly.
+                        if ( currentContigReadsIterator == null ) {
+                            currentContigReadsIterator = new PeekableIterator(readsDataSource.getIterator(shard));
+                        }
+
+                        if ( currentContigReadsIterator.hasNext() ) {
+                            shard.fill(currentContigReadsIterator);
+                            nextShard = shard;
+                        }
+                    }
+                }
+            }
+
+            /**
+             * Aggregate all FilePointers for the next contig together into one combined FilePointer
+             * to avoid boundary issues with visiting the same file regions more than once (since more
+             * granular FilePointers will have regions that overlap with other nearby FilePointers due
+             * to the nature of BAM indices).
+             *
+             * By creating one persistent set of iterators per contig we also avoid boundary artifacts
+             * in the engine-level downsampling.
+             *
+             * TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for
+             * TODO: read traversals, as there's little point in the BAMSchedule emitting extremely
+             * TODO: granular FilePointers if we're just going to union them. The BAMSchedule should
+             * TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for
+             * TODO: locus traversals).
+             */
+            private void createNextContigFilePointer() {
+                currentContigFilePointer = null;  // stays null when no FilePointers remain
+                List<FilePointer> nextContigFilePointers = new ArrayList<FilePointer>();
+                if ( filePointers.hasNext() ) {
+                    logger.info("Loading BAM index data for next contig");  // only announce work that will actually happen
+                }
+                while ( filePointers.hasNext() ) {
+                    final FilePointer upcoming = filePointers.peek();
+                    // Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
+                    // it is the ONLY FilePointer we ever encounter
+                    if ( encounteredMonolithicFilePointer ) {
+                        throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
+                    }
+                    if ( upcoming.isMonolithic() ) {
+                        if ( totalFilePointersConsumed > 0 ) {
+                            throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
+                        }
+                        encounteredMonolithicFilePointer = true;
+                        logger.debug(String.format("Encountered monolithic FilePointer: %s", upcoming));
+                    }
+
+                    // If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
+                    // same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
+                    if ( nextContigFilePointers.isEmpty() ||
+                             (! nextContigFilePointers.get(0).isRegionUnmapped && ! upcoming.isRegionUnmapped &&
+                             nextContigFilePointers.get(0).getContigIndex() == upcoming.getContigIndex()) ||
+                                 (nextContigFilePointers.get(0).isRegionUnmapped && upcoming.isRegionUnmapped) ) {
+
+                        nextContigFilePointers.add(filePointers.next());
+                        totalFilePointersConsumed++;
+                    }
+                    else {
+                        break; // next FilePointer is on a different contig or has different mapped/unmapped status,
+                               // save it for next time
+                    }
+                }
+
+                if ( ! nextContigFilePointers.isEmpty() ) {
+                    currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser);
+                }
+
+                if ( currentContigFilePointer != null ) {
+                    logger.info("Done loading BAM index data for next contig");
+                    logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer));
+                }
+            }
+
+            public void remove() {
+                throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
+            }
+        };
+    }
+
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java
index df7827250..197015641 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/FilePointer.java
@@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk.datasources.reads;
 
 import net.sf.picard.util.PeekableIterator;
 import net.sf.samtools.GATKBAMFileSpan;
+import net.sf.samtools.GATKChunk;
 import net.sf.samtools.SAMFileSpan;
+import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.Utils;
@@ -48,18 +50,87 @@ public class FilePointer {
      */
     protected final boolean isRegionUnmapped;
 
-    public FilePointer(final GenomeLoc... locations) {
-        this.locations.addAll(Arrays.asList(locations));
+    /**
+     * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
+     * ever visit during this GATK run? If this is set to true, the engine will expect to see only this
+     * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
+     * from more than one contig.
+     */
+    private boolean isMonolithic = false;
+
+    /**
+     * Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers
+     */
+    private Integer contigIndex = null;
+
+
+    public FilePointer( List<GenomeLoc> locations ) {
+        this.locations.addAll(locations);
+        this.isRegionUnmapped = checkUnmappedStatus();
+        // isMonolithic is still false here, so mapped locations must all share one contig
+        validateAllLocations();
+        if ( ! locations.isEmpty() ) {
+            contigIndex = locations.get(0).getContigIndex();
+        }
+    }
+
+    public FilePointer( final GenomeLoc... locations ) {
+        this(Arrays.asList(locations));
+    }
+
+    public FilePointer( Map<SAMReaderID, SAMFileSpan> fileSpans, List<GenomeLoc> locations ) {
+        this(locations);  // validates the locations and records mapped/unmapped status first
+        this.fileSpans.putAll(fileSpans);
+    }
+
+    private boolean checkUnmappedStatus() {
         boolean foundMapped = false, foundUnmapped = false;
-        for(GenomeLoc location: locations) {
-            if(GenomeLoc.isUnmapped(location))
+
+        for( GenomeLoc location: locations ) {
+            if ( GenomeLoc.isUnmapped(location) )
                 foundUnmapped = true;
             else
                 foundMapped = true;
         }
-        if(foundMapped && foundUnmapped)
+        if ( foundMapped && foundUnmapped )
             throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
-        this.isRegionUnmapped = foundUnmapped;
+
+        return foundUnmapped;
+    }
+
+    private void validateAllLocations() {
+        // Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction
+        if ( isRegionUnmapped || isMonolithic ) {
+            return;
+        }
+
+        Integer previousContigIndex = null;
+
+        for ( GenomeLoc location : locations ) {
+            if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) {
+                throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
+            }
+
+            previousContigIndex = location.getContigIndex();
+        }
+    }
+
+    private void validateLocation( GenomeLoc location ) {
+        if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) {
+            throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
+        }
+        if ( ! isRegionUnmapped && ! isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) {
+            throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
+        }
+    }
+
+    /**
+     * Returns an unmodifiable view of this FilePointer's file spans
+     *
+     * @return an unmodifiable view (not a copy) of this FilePointer's file spans
+     */
+    public Map<SAMReaderID, SAMFileSpan> getFileSpans() {
+        return Collections.unmodifiableMap(fileSpans);
     }
 
     /**
@@ -70,6 +141,39 @@ public class FilePointer {
         return Collections.unmodifiableList(locations);
     }
 
+    /**
+     * Returns the contig index of this FilePointer's first location. For a monolithic FilePointer, which
+     * may hold intervals from more than one contig, this is only the contig of the first location.
+     *
+     * @return the first location's contig index, or SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX if there are no locations
+     */
+    public int getContigIndex() {
+        return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
+    }
+
+    /**
+     * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
+     * ever visit during this GATK run? If this is set to true, the engine will expect to see only this
+     * one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
+     * from more than one contig.
+     *
+     * @return true if this FP is a monolithic FP representing all regions in all files, otherwise false
+     */
+    public boolean isMonolithic() {
+        return isMonolithic;
+    }
+
+    /**
+     * Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all
+     * regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic
+     * FP may contain intervals from more than one contig.
+     *
+     * @param isMonolithic set this FP's monolithic status to this value
+     */
+    public void setIsMonolithic( boolean isMonolithic ) {
+        this.isMonolithic = isMonolithic;
+    }
+
     @Override
     public boolean equals(final Object other) {
         if(!(other instanceof FilePointer))
@@ -98,7 +202,12 @@ public class FilePointer {
     }
 
     public void addLocation(final GenomeLoc location) {
-        locations.add(location);
+        validateLocation(location);
+
+        this.locations.add(location);
+        if ( contigIndex == null ) {
+            contigIndex = location.getContigIndex();
+        }
     }
 
     public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) {
@@ -216,6 +325,84 @@ public class FilePointer {
         combined.addFileSpans(initialElement.getKey(),fileSpan);
     }
 
+    /**
+     * Efficiently generate the union of the n FilePointers passed in. Much more efficient than
+     * combining two FilePointers at a time using the combine() method above.
+     *
+     * IMPORTANT: the FilePointers to be unioned must either all represent regions on the
+     * same contig, or all be unmapped, since we cannot create FilePointers with a mix of
+     * contigs or with mixed mapped/unmapped regions.
+     *
+     * @param filePointers the FilePointers to union; null or empty yields an empty FilePointer
+     * @param parser our GenomeLocParser
+     * @return the union of the FilePointers passed in
+     */
+    public static FilePointer union( List<FilePointer> filePointers, GenomeLocParser parser ) {
+        if ( filePointers == null || filePointers.isEmpty() ) {
+            return new FilePointer();
+        }
+
+        Map<SAMReaderID, List<GATKChunk>> fileChunks = new HashMap<SAMReaderID, List<GATKChunk>>();
+        List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
+
+        // First extract all intervals and file chunks from the FilePointers into unsorted, unmerged collections
+        for ( FilePointer filePointer : filePointers ) {
+            locations.addAll(filePointer.getLocations());
+
+            for ( Map.Entry<SAMReaderID, SAMFileSpan> fileSpanEntry : filePointer.getFileSpans().entrySet() ) {
+                GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)fileSpanEntry.getValue();
+
+                List<GATKChunk> chunksForFile = fileChunks.get(fileSpanEntry.getKey());
+                if ( chunksForFile == null ) {
+                    chunksForFile = new ArrayList<GATKChunk>();  // own list: never append to one handed out by a file span
+                    fileChunks.put(fileSpanEntry.getKey(), chunksForFile);
+                }
+                chunksForFile.addAll(fileSpan.getGATKChunks());
+            }
+        }
+
+        // Now sort and merge the intervals
+        List<GenomeLoc> sortedMergedLocations = new ArrayList<GenomeLoc>();
+        sortedMergedLocations.addAll(IntervalUtils.sortAndMergeIntervals(parser, locations, IntervalMergingRule.ALL));
+
+        // For each BAM file, convert from an unsorted, unmerged list of chunks to a GATKBAMFileSpan containing
+        // the sorted, merged union of the chunks for that file
+        Map<SAMReaderID, SAMFileSpan> mergedFileSpans = new HashMap<SAMReaderID, SAMFileSpan>(fileChunks.size());
+        for ( Map.Entry<SAMReaderID, List<GATKChunk>> fileChunksEntry : fileChunks.entrySet() ) {
+            List<GATKChunk> unmergedChunks = fileChunksEntry.getValue();
+            mergedFileSpans.put(fileChunksEntry.getKey(),
+                                (new GATKBAMFileSpan(unmergedChunks.toArray(new GATKChunk[unmergedChunks.size()]))).union(new GATKBAMFileSpan()));
+        }
+
+        return new FilePointer(mergedFileSpans, sortedMergedLocations);
+    }
+
+    /**
+     * Returns true if any of the file spans in this FilePointer overlap their counterparts in
+     * the other FilePointer. "Overlap" is defined as having an overlapping extent (the region
+     * from the start of the first chunk to the end of the last chunk).
+     *
+     * @param other the FilePointer against which to check overlap with this FilePointer
+     * @return true if any file spans overlap their counterparts in other, otherwise false
+     */
+    public boolean hasFileSpansOverlappingWith( FilePointer other ) {
+        for ( Map.Entry<SAMReaderID, SAMFileSpan> thisFilePointerEntry : fileSpans.entrySet() ) {
+            SAMFileSpan otherEntry = other.fileSpans.get(thisFilePointerEntry.getKey());
+            if ( otherEntry == null ) {
+                continue;  // no counterpart for this file span in other
+            }
+            // Both sides exist for this file, so compare their extents
+            GATKBAMFileSpan thisFileSpan = new GATKBAMFileSpan(thisFilePointerEntry.getValue());
+            GATKBAMFileSpan otherFileSpan = new GATKBAMFileSpan(otherEntry);
+
+            if ( thisFileSpan.getExtent().overlaps(otherFileSpan.getExtent()) ) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+
     @Override
     public String toString() {
         StringBuilder builder = new StringBuilder();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java
index f78693c27..cc0a371ea 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/IntervalSharder.java
@@ -73,8 +73,15 @@ public class IntervalSharder implements Iterator {
      */
     public FilePointer next() {
         FilePointer current = wrappedIterator.next();
-        while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
+
+        while ( wrappedIterator.hasNext() &&
+                current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped &&
+                (current.getContigIndex() == wrappedIterator.peek().getContigIndex() || current.isRegionUnmapped) &&
+                current.minus(wrappedIterator.peek()) == 0 ) {
+
             current = current.combine(parser,wrappedIterator.next());
+        }
+
         return current;
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java
index 585b63457..e1bf2d98e 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/LocusShardBalancer.java
@@ -42,8 +42,10 @@ public class LocusShardBalancer extends ShardBalancer {
 
             public Shard next() {
                 FilePointer current = filePointers.next();
-                while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0)
-                    current = current.combine(parser,filePointers.next());
+
+                // FilePointers have already been combined as necessary at the IntervalSharder level. No
+                // need to do so again here.
+
                 return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans);
             }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
index 96b55674a..27e666f6f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShard.java
@@ -1,16 +1,15 @@
 package org.broadinstitute.sting.gatk.datasources.reads;
 
-import net.sf.samtools.SAMFileSpan;
-import net.sf.samtools.SAMRecord;
+import net.sf.picard.util.PeekableIterator;
+import net.sf.samtools.*;
+import net.sf.samtools.util.CloseableIterator;
 import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
 import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 
 /**
  *
@@ -35,10 +34,21 @@ import java.util.Map;
  * @version 0.1
  */
 public class ReadShard extends Shard {
+
+    /**
+     * Default read shard buffer size
+     */
+    public static final int DEFAULT_MAX_READS = 10000;
+
     /**
      * What is the maximum number of reads per BAM file which should go into a read shard.
+     *
+     * TODO: this non-final static variable should either be made final or turned into an
+     * TODO: instance variable somewhere -- as both static and mutable it wreaks havoc
+     * TODO: with tests that use multiple instances of SAMDataSource (since SAMDataSource
+     * TODO: changes this value)
      */
-    public static int MAX_READS = 10000;
+    public static int MAX_READS = DEFAULT_MAX_READS;
 
     /**
      * The reads making up this shard.
@@ -52,12 +62,24 @@ public class ReadShard extends Shard {
     /**
      * Sets the maximum number of reads buffered in a read shard.  Implemented as a weirdly static interface
      * until we know what effect tuning this parameter has.
+     *
+     * TODO: this mutable static interface is awful and breaks tests -- need to refactor
+     *
      * @param bufferSize New maximum number
      */
     static void setReadBufferSize(final int bufferSize) {
         MAX_READS = bufferSize;
     }
 
+    /**
+     * Reports the read buffer size currently in effect.
+     *
+     * @return the current value of MAX_READS, the maximum number of reads buffered in a read shard
+     */
+    public static int getReadBufferSize() {
+        return MAX_READS;
+    }
+
     /**
      * Returns true if this shard is meant to buffer reads, rather
      * than just holding pointers to their locations.
@@ -93,6 +115,67 @@ public class ReadShard extends Shard {
         reads.add(read);
     }
 
+    /**
+     * Buffers reads drawn from the supplied iterator into this shard.
+     *
+     * @param readIter Iterator from which to draw the reads to fill the shard; reads that are only peeked at are not consumed
+     */
+    @Override
+    public void fill( PeekableIterator readIter ) {
+        if( ! buffersReads() )
+            throw new ReviewedStingException("Attempting to fill a non-buffering shard.");
+
+        final SAMFileHeader.SortOrder sortOrder = getReadProperties().getSortOrder();
+        SAMRecord lastAdded = null;
+
+        // Main fill: keep buffering until the shard is full, the stream runs dry,
+        // or the upcoming read lies on a different contig than the reads already added
+        while( ! isBufferFull() && readIter.hasNext() ) {
+            final SAMRecord upcoming = readIter.peek();
+            final boolean sameContig = lastAdded == null ||
+                                       upcoming.getReferenceIndex().equals(lastAdded.getReferenceIndex());
+            if ( ! sameContig )
+                break;
+            lastAdded = readIter.next();
+            addRead(lastAdded);
+        }
+
+        // If the reads are sorted in coordinate order, ensure that all reads
+        // having the same alignment start become part of the same shard, to allow
+        // downsampling to work better across shard boundaries. Note that because our
+        // read stream has already been fed through the positional downsampler, which
+        // ensures that at each alignment start position there are no more than dcov
+        // reads, we're in no danger of accidentally creating a disproportionately huge
+        // shard
+        if ( sortOrder == SAMFileHeader.SortOrder.coordinate ) {
+            while ( readIter.hasNext() ) {
+                final SAMRecord upcoming = readIter.peek();
+
+                // Keep going only while the upcoming read is mapped and shares both the
+                // contig and the alignment start of the last read added by the main fill
+                final boolean sameStart = lastAdded != null &&
+                                          ! upcoming.getReadUnmappedFlag() &&
+                                          upcoming.getReferenceIndex().equals(lastAdded.getReferenceIndex()) &&
+                                          upcoming.getAlignmentStart() == lastAdded.getAlignmentStart();
+                if ( ! sameStart )
+                    break;
+
+                addRead(readIter.next());
+            }
+        }
+
+        // If the reads are sorted in queryname order, ensure that all reads
+        // having the same queryname become part of the same shard.
+        if( sortOrder == SAMFileHeader.SortOrder.queryname ) {
+            while( readIter.hasNext() ) {
+                final SAMRecord upcoming = readIter.peek();
+                if( lastAdded == null || ! lastAdded.getReadName().equals(upcoming.getReadName()) )
+                    break;
+                addRead(readIter.next());
+            }
+        }
+    }
+
     /**
      * Creates an iterator over reads stored in this shard's read cache.
      * @return
@@ -116,4 +199,48 @@ public class ReadShard extends Shard {
         }
         return sb.toString();
     }
+
+    /**
+     * Get the interval actually covered by the buffered reads: from the smallest alignment start
+     * to the largest alignment end among the mapped reads. This may differ from getLocation(),
+     * which reflects the targeted span rather than the span of the reads themselves.
+     *
+     * @return the shard's own location when the shard is unmapped, has no genome locs, or holds no reads;
+     *         GenomeLoc.UNMAPPED when no buffered read is mapped; otherwise the span of the reads
+     */
+    public GenomeLoc getReadsSpan() {
+        // Nothing to measure: fall back to the location reported by the superclass
+        if ( isUnmapped() || super.getGenomeLocs() == null || reads.isEmpty() )
+            return super.getLocation();
+
+        String spanContig = null;
+        int leftmost = Integer.MAX_VALUE;
+        int rightmost = Integer.MIN_VALUE;
+        boolean sawMappedRead = false;
+
+        for ( final SAMRecord read : reads ) {
+            final String readContig = read.getReferenceName();
+            if ( spanContig != null && ! readContig.equals(spanContig) )
+                throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. "
+                        + "First contig is " + spanContig + " next read was " + readContig );
+            spanContig = readContig;
+
+            // A shard that is not "unmapped" as a whole can still buffer unmapped mates of mapped
+            // reads (possibly nothing but such mates), so they must not contribute to the span
+            if ( read.getReadUnmappedFlag() )
+                continue;
+
+            sawMappedRead = true;
+            leftmost = Math.min(leftmost, read.getAlignmentStart());
+            rightmost = Math.max(rightmost, read.getAlignmentEnd());
+        }
+
+        assert spanContig != null;
+
+        // Either no mapped read was seen or the contig is "*": report the span as unmapped
+        final boolean nothingMapped = ! sawMappedRead || spanContig.equals("*");
+        return nothingMapped
+                ? GenomeLoc.UNMAPPED
+                : parser.createGenomeLoc(spanContig, leftmost, rightmost);
+    }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java
index 311c7874f..18fafb95d 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ReadShardBalancer.java
@@ -34,6 +34,8 @@ import java.util.NoSuchElementException;
 
 /**
  * Divide up large file pointers containing reads into more manageable subcomponents.
+ *
+ * TODO: delete this class once the experimental downsampling engine fork collapses
  */
 public class ReadShardBalancer extends ShardBalancer {
     /**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
index 7f0a0c4c0..bb788c89f 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java
@@ -24,14 +24,15 @@
 
 package org.broadinstitute.sting.gatk.datasources.reads;
 
-import net.sf.picard.reference.IndexedFastaSequenceFile;
 import net.sf.picard.sam.MergingSamRecordIterator;
 import net.sf.picard.sam.SamFileHeaderMerger;
 import net.sf.samtools.*;
 import net.sf.samtools.util.CloseableIterator;
 import net.sf.samtools.util.RuntimeIOException;
 import org.apache.log4j.Logger;
-import org.broadinstitute.sting.gatk.DownsamplingMethod;
+import org.broadinstitute.sting.gatk.downsampling.*;
+import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
+import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
 import org.broadinstitute.sting.gatk.ReadMetrics;
 import org.broadinstitute.sting.gatk.ReadProperties;
 import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
@@ -42,12 +43,9 @@ import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.GenomeLocSortedSet;
 import org.broadinstitute.sting.utils.SimpleTimer;
-import org.broadinstitute.sting.utils.baq.BAQ;
-import org.broadinstitute.sting.utils.baq.BAQSamIterator;
+import org.broadinstitute.sting.utils.baq.ReadTransformingIterator;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
-import org.broadinstitute.sting.utils.recalibration.BQSRSamIterator;
-import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
 import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
 
 import java.io.File;
@@ -101,6 +99,8 @@ public class SAMDataSource {
 
     /**
      * How far along is each reader?
+     *
+     * TODO: delete this once the experimental downsampling engine fork collapses
      */
     private final Map readerPositions = new HashMap();
 
@@ -200,11 +200,8 @@ public class SAMDataSource {
                 downsamplingMethod,
                 exclusionList,
                 supplementalFilters,
+                Collections.emptyList(),
                 includeReadsWithDeletionAtLoci,
-                BAQ.CalculationMode.OFF,
-                BAQ.QualityMode.DONT_MODIFY,
-                null, // no BAQ
-                null, // no BQSR
                 (byte) -1,
                 false);
     }
@@ -234,11 +231,8 @@ public class SAMDataSource {
             DownsamplingMethod downsamplingMethod,
             ValidationExclusion exclusionList,
             Collection supplementalFilters,
+            List readTransformers,
             boolean includeReadsWithDeletionAtLoci,
-            BAQ.CalculationMode cmode,
-            BAQ.QualityMode qmode,
-            IndexedFastaSequenceFile refReader,
-            BaseRecalibration bqsrApplier,
             byte defaultBaseQualities,
             boolean removeProgramRecords) {
         this.readMetrics = new ReadMetrics();
@@ -258,11 +252,11 @@ public class SAMDataSource {
         validationStringency = strictness;
         this.removeProgramRecords = removeProgramRecords;
         if(readBufferSize != null)
-            ReadShard.setReadBufferSize(readBufferSize);
+            ReadShard.setReadBufferSize(readBufferSize);   // TODO: use of non-final static variable here is just awful, especially for parallel tests
         else {
             // Choose a sensible default for the read buffer size.  For the moment, we're picking 1000 reads per BAM per shard (which effectively
             // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once.
-            ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000));
+            ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000));
         }
 
         resourcePool = new SAMResourcePool(Integer.MAX_VALUE);
@@ -303,16 +297,14 @@ public class SAMDataSource {
         readProperties = new ReadProperties(
                 samFiles,
                 mergedHeader,
+                sortOrder,
                 useOriginalBaseQualities,
                 strictness,
                 downsamplingMethod,
                 exclusionList,
                 supplementalFilters,
+                readTransformers,
                 includeReadsWithDeletionAtLoci,
-                cmode,
-                qmode,
-                refReader,
-                bqsrApplier,
                 defaultBaseQualities);
 
         // cache the read group id (original) -> read group id (merged)
@@ -388,7 +380,10 @@ public class SAMDataSource {
     /**
      * Retrieves the current position within the BAM file.
      * @return A mapping of reader to current position.
+     *
+     * TODO: delete this once the experimental downsampling engine fork collapses
      */
+    @Deprecated
     public Map getCurrentPosition() {
         return readerPositions;
     }
@@ -471,9 +466,15 @@ public class SAMDataSource {
     }
 
     /**
-     * Fill the given buffering shard with reads.
+     * Legacy method to fill the given buffering shard with reads.
+     *
+     * Shard.fill() is used instead of this method when experimental downsampling is enabled
+     *
+     * TODO: delete this method once the experimental downsampling engine fork collapses
+     *
      * @param shard Shard to fill.
      */
+    @Deprecated
     public void fillShard(Shard shard) {
         if(!shard.buffersReads())
             throw new ReviewedStingException("Attempting to fill a non-buffering shard.");
@@ -486,9 +487,15 @@ public class SAMDataSource {
 
         CloseableIterator iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate);
         while(!shard.isBufferFull() && iterator.hasNext()) {
-            read = iterator.next();
-            shard.addRead(read);
-            noteFilePositionUpdate(positionUpdates,read);
+            final SAMRecord nextRead = iterator.next();
+            if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) {
+                // only add reads to the shard if they are on the same contig
+                read = nextRead;
+                shard.addRead(read);
+                noteFilePositionUpdate(positionUpdates,read);
+            } else {
+                break;
+            }
         }
 
         // If the reads are sorted in queryname order, ensure that all reads
@@ -510,6 +517,10 @@ public class SAMDataSource {
             readerPositions.put(readers.getReaderID(positionUpdate.getKey()),positionUpdate.getValue());
     }
 
+    /*
+     * TODO: delete this method once the experimental downsampling engine fork collapses
+     */
+    @Deprecated
     private void noteFilePositionUpdate(Map positionMapping, SAMRecord read) {
         GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing());
         positionMapping.put(read.getFileSource().getReader(),endChunk);
@@ -520,8 +531,7 @@ public class SAMDataSource {
             return shard.iterator();
         }
         else {
-            SAMReaders readers = resourcePool.getAvailableReaders();
-            return getIterator(readers,shard,shard instanceof ReadShard);
+            return getIterator(shard);
         }
     }
 
@@ -541,13 +551,44 @@ public class SAMDataSource {
 
     /**
      * Initialize the current reader positions
+     *
+     * TODO: delete this once the experimental downsampling engine fork collapses
+     *
      * @param readers
      */
+    @Deprecated
     private void initializeReaderPositions(SAMReaders readers) {
         for(SAMReaderID id: getReaderIDs())
             readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
     }
 
+    /**
+     * Get the initial reader positions across all BAM files, using readers borrowed from the resource pool.
+     *
+     * @return a map from each reader ID to a GATKBAMFileSpan built from that reader's file pointer spanning its reads
+     */
+    public Map getInitialReaderPositions() {
+        Map initialPositions = new HashMap();
+        SAMReaders readers = resourcePool.getAvailableReaders();
+        try {
+            for ( SAMReaderID id: getReaderIDs() )
+                initialPositions.put(id, new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
+        } finally {
+            resourcePool.releaseReaders(readers);   // always hand the readers back to the pool, even if a lookup throws
+        }
+        return initialPositions;
+    }
+
+    /**
+     * Get an iterator over the data types specified in the shard, reading through readers drawn from the resource pool.
+     * @param shard The shard specifying the data limits.
+     * @return An iterator over the selected data.
+     */
+    public StingSAMIterator getIterator( Shard shard ) {
+        final SAMReaders pooledReaders = resourcePool.getAvailableReaders();   // NOTE(review): not released here -- presumably done when the iterator is closed; confirm
+        return getIterator(pooledReaders, shard, shard instanceof ReadShard);
+    }
+
     /**
      * Get an iterator over the data types specified in the shard.
      * @param readers Readers from which to load data.
@@ -585,6 +626,7 @@ public class SAMDataSource {
             iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator);
             if(shard.getGenomeLocs().size() > 0)
                 iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
+
             iteratorMap.put(readers.getReader(id), iterator);
         }
 
@@ -597,10 +639,7 @@ public class SAMDataSource {
                 readProperties.getDownsamplingMethod().toFraction,
                 readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
                 readProperties.getSupplementalFilters(),
-                readProperties.getBAQCalculationMode(),
-                readProperties.getBAQQualityMode(),
-                readProperties.getRefReader(),
-                readProperties.getBQSRApplier(),
+                readProperties.getReadTransformers(),
                 readProperties.defaultBaseQualities());
     }
 
@@ -667,40 +706,62 @@ public class SAMDataSource {
                                                         Double downsamplingFraction,
                                                         Boolean noValidationOfReadOrder,
                                                         Collection supplementalFilters,
-                                                        BAQ.CalculationMode cmode,
-                                                        BAQ.QualityMode qmode,
-                                                        IndexedFastaSequenceFile refReader,
-                                                        BaseRecalibration bqsrApplier,
+                                                        List readTransformers,
                                                         byte defaultBaseQualities) {
 
-        // *********************************************************************************** //
-        // *  NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
-        // *     (otherwise we will process something that we may end up throwing away)      * //
-        // *********************************************************************************** //
+        // ************************************************************************************************ //
+        // *  NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
+        // *     (otherwise we will process something that we may end up throwing away)                   * //
+        // ************************************************************************************************ //
 
-        if (downsamplingFraction != null)
-            wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
+        wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
+
+        if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) {
+            wrappedIterator = applyDownsamplingIterator(wrappedIterator);
+        }
+
+        // Use the old fractional downsampler only if we're not using experimental downsampling:
+        if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null )
+            wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction);
 
         // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification,
         // verify the read ordering by applying a sort order iterator
         if (!noValidationOfReadOrder && enableVerification)
-            wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);
-
-        wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
+            wrappedIterator = new VerifyingSamIterator(wrappedIterator);
 
         if (useOriginalBaseQualities || defaultBaseQualities >= 0)
             // only wrap if we are replacing the original qualities or using a default base quality
             wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
 
-        if (bqsrApplier != null)
-            wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier);
-
-        if (cmode != BAQ.CalculationMode.OFF)
-            wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode);
+        // set up read transformers
+        for ( final ReadTransformer readTransformer : readTransformers ) {
+            if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT )
+                wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer);
+        }
 
         return wrappedIterator;
     }
 
+    // Wraps the iterator in a downsampling iterator chosen from the configured DownsamplingMethod: a positional
+    // downsampler when toCoverage is set, a fractional one otherwise. Other downsample types leave it unwrapped.
+    protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) {
+        final DownsamplingMethod method = readProperties.getDownsamplingMethod();
+        final boolean byCoverage = method.toCoverage != null;
+        if ( method.type == DownsampleType.BY_SAMPLE ) {
+            final ReadsDownsamplerFactory perSampleFactory = byCoverage ? new SimplePositionalDownsamplerFactory(method.toCoverage)
+                                                                         : new FractionalDownsamplerFactory(method.toFraction);
+            return new PerSampleDownsamplingReadsIterator(wrappedIterator, perSampleFactory);
+        }
+        if ( method.type == DownsampleType.ALL_READS ) {
+            final ReadsDownsampler sharedDownsampler = byCoverage ? new SimplePositionalDownsampler(method.toCoverage)
+                                                                  : new FractionalDownsampler(method.toFraction);
+            return new DownsamplingReadsIterator(wrappedIterator, sharedDownsampler);
+        }
+
+        return wrappedIterator;
+    }
+
+
     private class SAMResourcePool {
         /**
          * How many entries can be cached in this resource pool?
@@ -947,6 +1008,12 @@ public class SAMDataSource {
             } catch ( SAMFormatException e ) {
                 throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
             }
+            // Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files).
+            // Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case,
+            // just in case we want to change this behavior later.
+            catch ( RuntimeException e ) {
+                throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
+            }
             reader.setSAMRecordFactory(factory);
             reader.enableFileSource(true);
             reader.setValidationStringency(validationStringency);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java
index f8d941784..e22a7a54d 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java
@@ -1,5 +1,6 @@
 package org.broadinstitute.sting.gatk.datasources.reads;
 
+import net.sf.picard.util.PeekableIterator;
 import net.sf.samtools.SAMFileSpan;
 import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.ReadMetrics;
@@ -203,6 +204,12 @@ public abstract class Shard implements HasGenomeLocation {
      */
     public void addRead(SAMRecord read) { throw new UnsupportedOperationException("This shard does not buffer reads."); }
 
+    /**
+     * Fills the shard with reads. This default implementation always throws UnsupportedOperationException; ReadShard, which buffers reads, overrides it.
+     * @param readIter Iterator from which to draw the reads to fill the shard
+     */
+    public void fill( PeekableIterator readIter ) { throw new UnsupportedOperationException("This shard does not buffer reads."); }
+
     /**
      * Gets the iterator over the elements cached in the shard.
      * @return
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java
index 5b4be2fc6..664d96321 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/rmd/ReferenceOrderedDataSource.java
@@ -34,8 +34,10 @@ import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
 
 import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.lang.reflect.Type;
 import java.util.List;
@@ -239,6 +241,8 @@ class ReferenceOrderedQueryDataPool extends ResourcePool {
 
-    /*
-     * Submit one item to the downsampler for consideration . Some downsamplers will be able to determine
+    /**
+     * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine
      * immediately whether the item survives the downsampling process, while others will need to see
      * more items before making that determination.
+     *
+     * @param item the individual item to submit to the downsampler for consideration
      */
     public void submit( T item );
 
-    /*
-     * Submit a collection of items to the downsampler for consideration.
+    /**
+     * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling
+     * submit() on each individual item in the collection.
+     *
+     * @param items the collection of items to submit to the downsampler for consideration
      */
     public void submit( Collection items );
 
-    /*
+    /**
      * Are there items that have survived the downsampling process waiting to be retrieved?
+     *
+     * @return true if this downsampler has > 0 finalized items, otherwise false
      */
-    public boolean hasDownsampledItems();
+    public boolean hasFinalizedItems();
 
-    /*
-     * Return (and remove) all items that have survived downsampling and are waiting to be retrieved.
+    /**
+     * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved.
+     *
+     * @return a list of all finalized items this downsampler contains, or an empty list if there are none
      */
-    public List consumeDownsampledItems();
+    public List consumeFinalizedItems();
 
-    /*
+    /**
      * Are there items stored in this downsampler that it doesn't yet know whether they will
      * ultimately survive the downsampling process?
+     *
+     * @return true if this downsampler has > 0 pending items, otherwise false
      */
     public boolean hasPendingItems();
 
-    /*
+    /**
+     * Peek at the first finalized item stored in this downsampler (or null if there are no finalized items)
+     *
+     * @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call),
+     *         or null if there are none
+     */
+    public T peekFinalized();
+
+    /**
+     * Peek at the first pending item stored in this downsampler (or null if there are no pending items)
+     *
+     * @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call),
+     *         or null if there are none
+     */
+    public T peekPending();
+
+    /**
+     * Returns the number of items discarded (so far) during the downsampling process
+     *
+     * @return the number of items that have been submitted to this downsampler and discarded in the process of
+     *         downsampling
+     */
+    public int getNumberOfDiscardedItems();
+
+    /**
      * Used to tell the downsampler that no more items will be submitted to it, and that it should
      * finalize any pending items.
      */
     public void signalEndOfInput();
 
-    /*
-     * Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state
-     * information.
+    /**
+     * Empty the downsampler of all finalized/pending items
      */
     public void clear();
+
+    /**
+     * Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items
+     */
+    public void reset();
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java
new file mode 100644
index 000000000..ae1d98ce0
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingMethod.java
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.downsampling;
+
+import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
+import org.broadinstitute.sting.gatk.walkers.LocusWalker;
+import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+
+/**
+ * Describes the method for downsampling reads at a given locus.
+ */
+
+public class DownsamplingMethod {
+    /**
+     * Type of downsampling to perform.
+     */
+    public final DownsampleType type;
+
+    /**
+     * Actual downsampling target is specified as an integer number of reads.
+     */
+    public final Integer toCoverage;
+
+    /**
+     * Actual downsampling target is specified as a fraction of total available reads.
+     */
+    public final Double toFraction;
+
+    /**
+     * Use the new experimental downsampling?
+     */
+    public final boolean useExperimentalDownsampling;
+
+    /**
+     * Expresses no downsampling applied at all.
+     */
+    public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null,false);
+
+    /**
+     * Default type to use if no type is specified
+     */
+    public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
+
+    /**
+     * Default target coverage for locus-based traversals
+     */
+    public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000;
+
+    public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) {
+        this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE;
+        this.useExperimentalDownsampling = useExperimentalDownsampling;
+
+        // A NONE method has no target, so any coverage/fraction passed in is discarded. This must be decided
+        // before the final fields are assigned: nulling the constructor parameters afterwards has no effect.
+        final boolean noDownsampling = this.type == DownsampleType.NONE;
+        this.toCoverage = noDownsampling ? null : toCoverage;
+        this.toFraction = noDownsampling ? null : toFraction;
+
+        // Rejects inconsistent settings with a UserException.CommandLineException
+        validate();
+    }
+
+    private void validate() {
+        // Can't leave toFraction and toCoverage null unless type is NONE
+        if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null )
+            throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");
+
+        // Fraction and coverage cannot both be specified.
+        if ( toFraction != null && toCoverage != null )
+            throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified.  Please choose only one.");
+
+        // toCoverage must be > 0 when specified
+        if ( toCoverage != null && toCoverage <= 0 ) {
+            throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage");
+        }
+
+        // toFraction must be >= 0.0 and <= 1.0 when specified
+        if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) {
+            throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads");
+        }
+
+        // Some restrictions only exist for the old downsampling implementation:
+        if ( ! useExperimentalDownsampling ) {
+            // By sample downsampling does not work with a fraction of reads in the old downsampling implementation
+            if( type == DownsampleType.BY_SAMPLE && toFraction != null )
+                throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method");
+        }
+
+        // Some restrictions only exist for the new downsampling implementation:
+        if ( useExperimentalDownsampling ) {
+            if ( type == DownsampleType.ALL_READS && toCoverage != null ) {
+                throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation");
+            }
+        }
+    }
+
+    public String toString() {
+        StringBuilder builder = new StringBuilder("Downsampling Settings: ");
+
+        if ( type == DownsampleType.NONE ) {
+            builder.append("No downsampling");
+        }
+        else {
+            builder.append(String.format("Method: %s ", type));
+
+            if ( toCoverage != null ) {
+                builder.append(String.format("Target Coverage: %d ", toCoverage));
+            }
+            else {
+                builder.append(String.format("Target Fraction: %.2f ", toFraction));
+            }
+
+            if ( useExperimentalDownsampling ) {
+                builder.append("Using Experimental Downsampling");
+            }
+        }
+
+        return builder.toString();
+    }
+
+    public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) {
+        if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) {
+            return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE,
+                                          null, useExperimentalDownsampling);
+        }
+        else {
+            return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling);
+        }
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java
index bccc2e946..c8fbc829c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIterator.java
@@ -33,7 +33,8 @@ import java.util.NoSuchElementException;
 
 
 /**
- * StingSAMIterator wrapper around our generic reads downsampler interface
+ * StingSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style
+ * downsampler interface to a pull model.
  *
  * @author David Roazen
  */
@@ -42,35 +43,50 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
     private StingSAMIterator nestedSAMIterator;
     private ReadsDownsampler downsampler;
     private Collection downsampledReadsCache;
-    private Iterator downsampledReadsCacheIterator;
+    private SAMRecord nextRead = null;
+    private Iterator downsampledReadsCacheIterator = null;
 
+    /**
+     * @param iter wrapped iterator from which this iterator will pull reads
+     * @param downsampler downsampler through which the reads will be fed
+     */
     public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler downsampler ) {
         nestedSAMIterator = iter;
         this.downsampler = downsampler;
-        fillDownsampledReadsCache();
+
+        advanceToNextRead();
     }
 
     public boolean hasNext() {
-        if ( downsampledReadsCacheIterator.hasNext() ) {
-            return true;
-        }
-        else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
-            return false;
-        }
-
-        return true;
+        return nextRead != null;
     }
 
     public SAMRecord next() {
-        if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) {
+        if ( nextRead == null ) {
             throw new NoSuchElementException("next() called when there are no more items");
         }
 
-        return downsampledReadsCacheIterator.next();
+        SAMRecord toReturn = nextRead;
+        advanceToNextRead();
+
+        return toReturn;
+    }
+
+    private void advanceToNextRead() {
+        if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
+            nextRead = null;
+        }
+        else {
+            nextRead = downsampledReadsCacheIterator.next();
+        }
+    }
+
+    private boolean readyToReleaseReads() {
+        return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext();
     }
 
     private boolean fillDownsampledReadsCache() {
-        while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) {
+        while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) {
             downsampler.submit(nestedSAMIterator.next());
         }
 
@@ -78,7 +94,8 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
             downsampler.signalEndOfInput();
         }
 
-        downsampledReadsCache = downsampler.consumeDownsampledItems();
+        // use returned collection directly rather than make a copy, for speed
+        downsampledReadsCache = downsampler.consumeFinalizedItems();
         downsampledReadsCacheIterator = downsampledReadsCache.iterator();
 
         return downsampledReadsCacheIterator.hasNext();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java
index d5d529c9f..8901ae525 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsampler.java
@@ -33,7 +33,10 @@ import java.util.Collection;
 import java.util.List;
 
 /**
- * Fractional Downsampler: selects a specified fraction of the reads for inclusion
+ * Fractional Downsampler: selects a specified fraction of the reads for inclusion.
+ *
+ * Since the selection is done randomly, the actual fraction of reads retained may be slightly
+ * more or less than the requested fraction, depending on the total number of reads submitted.
  *
  * @author David Roazen
  */
@@ -43,8 +46,16 @@ public class FractionalDownsampler implements ReadsDownsamp
 
     private int cutoffForInclusion;
 
+    private int numDiscardedItems;
+
     private static final int RANDOM_POOL_SIZE = 10000;
 
+    /**
+     * Construct a FractionalDownsampler
+     *
+     * @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive).
+     *                 Actual number of reads preserved may differ randomly.
+     */
     public FractionalDownsampler( double fraction ) {
         if ( fraction < 0.0 || fraction > 1.0 ) {
             throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
@@ -52,12 +63,16 @@ public class FractionalDownsampler implements ReadsDownsamp
 
         cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
         clear();
+        reset();
     }
 
     public void submit( T newRead ) {
-        if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) {
+        if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(RANDOM_POOL_SIZE) < cutoffForInclusion ) {   // same pool the cutoff is scaled by
             selectedReads.add(newRead);
         }
+        else {
+            numDiscardedItems++;
+        }
     }
 
     public void submit( Collection newReads ) {
@@ -66,11 +81,12 @@ public class FractionalDownsampler implements ReadsDownsamp
         }
     }
 
-    public boolean hasDownsampledItems() {
+    public boolean hasFinalizedItems() {
         return selectedReads.size() > 0;
     }
 
-    public List consumeDownsampledItems() {
+    public List consumeFinalizedItems() {
+        // pass by reference rather than make a copy, for speed
         List downsampledItems = selectedReads;
         clear();
         return downsampledItems;
@@ -80,6 +96,18 @@ public class FractionalDownsampler implements ReadsDownsamp
         return false;
     }
 
+    public T peekFinalized() {
+        return selectedReads.isEmpty() ? null : selectedReads.get(0);
+    }
+
+    public T peekPending() {
+        return null;
+    }
+
+    public int getNumberOfDiscardedItems() {
+        return numDiscardedItems;
+    }
+
     public void signalEndOfInput() {
         // NO-OP
     }
@@ -88,7 +116,15 @@ public class FractionalDownsampler implements ReadsDownsamp
         selectedReads = new ArrayList();
     }
 
+    public void reset() {
+        numDiscardedItems = 0;
+    }
+
     public boolean requiresCoordinateSortOrder() {
         return false;
     }
+
+    public void signalNoMoreReadsBefore( T read ) {
+        // NO-OP
+    }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java
new file mode 100644
index 000000000..7a7c9e91e
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerFactory.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.downsampling;
+
+import net.sf.samtools.SAMRecord;
+
+/**
+ * Factory for creating FractionalDownsamplers on demand; every downsampler it creates uses the same fraction
+ *
+ * @author David Roazen
+ */
+public class FractionalDownsamplerFactory implements ReadsDownsamplerFactory {
+
+    private final double fraction;   // fixed at construction, passed to each new FractionalDownsampler
+
+    public FractionalDownsamplerFactory( double fraction ) {
+        this.fraction = fraction;
+    }
+
+    public ReadsDownsampler newInstance() {
+        return new FractionalDownsampler(fraction);
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java
new file mode 100644
index 000000000..73d69140d
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/LevelingDownsampler.java
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.downsampling;
+
+import org.broadinstitute.sting.utils.MathUtils;
+
+import java.util.*;
+
+/**
+ * Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from
+ * the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling
+ * does not occur until all Lists have been submitted and signalEndOfInput() is called.
+ *
+ * The Lists should be LinkedLists for maximum efficiency during item removal, however other
+ * kinds of Lists are also accepted (albeit at a slight performance penalty).
+ *
+ * Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface,
+ * the Lists need not contain reads. However, this downsampler may not be wrapped within one of the
+ * DownsamplingReadsIterators.
+ *
+ * @param <T> the List type representing the stacks to be leveled
+ * @param <E> the type of the elements of each List
+ *
+ * @author David Roazen
+ */
+public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T> {
+
+    private final int targetSize;
+
+    private List<T> groups;
+
+    private boolean groupsAreFinalized;
+
+    private int numDiscardedItems;
+
+    /**
+     * Construct a LevelingDownsampler
+     *
+     * @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed
+     *                   this value -- if it does, items are removed from Lists evenly until the total size
+     *                   is <= this value (or until every List has been reduced to a single item)
+     */
+    public LevelingDownsampler( int targetSize ) {
+        this.targetSize = targetSize;
+        clear();
+        reset();
+    }
+
+    public void submit( T item ) {
+        groups.add(item);
+    }
+
+    public void submit( Collection<T> items ){
+        groups.addAll(items);
+    }
+
+    public boolean hasFinalizedItems() {
+        return groupsAreFinalized && groups.size() > 0;
+    }
+
+    public List<T> consumeFinalizedItems() {
+        if ( ! hasFinalizedItems() ) {
+            return new ArrayList<T>();
+        }
+
+        // pass by reference rather than make a copy, for speed
+        List<T> toReturn = groups;
+        clear();
+        return toReturn;
+    }
+
+    public boolean hasPendingItems() {
+        return ! groupsAreFinalized && groups.size() > 0;
+    }
+
+    public T peekFinalized() {
+        return hasFinalizedItems() ? groups.get(0) : null;
+    }
+
+    public T peekPending() {
+        return hasPendingItems() ? groups.get(0) : null;
+    }
+
+    public int getNumberOfDiscardedItems() {
+        return numDiscardedItems;
+    }
+
+    public void signalEndOfInput() {
+        levelGroups();
+        groupsAreFinalized = true;
+    }
+
+    public void clear() {
+        groups = new ArrayList<T>();
+        groupsAreFinalized = false;
+    }
+
+    public void reset() {
+        numDiscardedItems = 0;
+    }
+
+    private void levelGroups() {
+        int totalSize = 0;
+        int[] groupSizes = new int[groups.size()];
+        int currentGroupIndex = 0;
+
+        for ( T group : groups ) {
+            groupSizes[currentGroupIndex] = group.size();
+            totalSize += groupSizes[currentGroupIndex];
+            currentGroupIndex++;
+        }
+
+        if ( totalSize <= targetSize ) {
+            return;    // no need to eliminate any items
+        }
+
+        // We will try to remove exactly this many items, however we will refuse to allow any
+        // one group to fall below size 1, and so might end up removing fewer items than this
+        int numItemsToRemove = totalSize - targetSize;
+
+        currentGroupIndex = 0;
+        int numConsecutiveUnmodifiableGroups = 0;
+
+        // Continue until we've either removed all the items we wanted to, or we can't
+        // remove any more items without violating the constraint that all groups must
+        // be left with at least one item
+        while ( numItemsToRemove > 0 && numConsecutiveUnmodifiableGroups < groupSizes.length ) {
+            if ( groupSizes[currentGroupIndex] > 1 ) {
+                groupSizes[currentGroupIndex]--;
+                numItemsToRemove--;
+                numConsecutiveUnmodifiableGroups = 0;
+            }
+            else {
+                numConsecutiveUnmodifiableGroups++;
+            }
+
+            currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length;
+        }
+
+        // Now we actually go through and reduce each group to its new count as specified in groupSizes
+        currentGroupIndex = 0;
+        for ( T group : groups ) {
+            downsampleOneGroup(group, groupSizes[currentGroupIndex]);
+            currentGroupIndex++;
+        }
+    }
+
+    private void downsampleOneGroup( T group, int numItemsToKeep ) {
+        if ( numItemsToKeep >= group.size() ) {
+            return;
+        }
+
+        numDiscardedItems += group.size() - numItemsToKeep;
+
+        BitSet itemsToKeep = new BitSet(group.size());
+        for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) {
+            itemsToKeep.set(selectedIndex);
+        }
+
+        int currentIndex = 0;
+
+        // If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator
+        if ( group instanceof LinkedList ) {
+            Iterator<E> iter = group.iterator();
+            while ( iter.hasNext() ) {
+                iter.next();
+
+                if ( ! itemsToKeep.get(currentIndex) ) {
+                    iter.remove();
+                }
+
+                currentIndex++;
+            }
+        }
+        // If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather
+        // than suffer O(n^2) of item shifting
+        else {
+            List<E> keptItems = new ArrayList<E>(numItemsToKeep);
+
+            for ( E item : group ) {
+                if ( itemsToKeep.get(currentIndex) ) {
+                    keptItems.add(item);
+                }
+                currentIndex++;
+            }
+            group.clear();
+            group.addAll(keptItems);
+        }
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java
new file mode 100644
index 000000000..5275c471e
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIterator.java
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.downsampling;
+
+import net.sf.samtools.SAMRecord;
+import net.sf.samtools.SAMRecordComparator;
+import net.sf.samtools.SAMRecordCoordinateComparator;
+import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
+
+import java.util.*;
+
+
+/**
+ * StingSAMIterator wrapper around our generic reads downsampler interface
+ * that downsamples reads for each sample independently, and then re-assembles
+ * the reads back into a single merged stream.
+ *
+ * @author David Roazen
+ */
+public class PerSampleDownsamplingReadsIterator implements StingSAMIterator {
+
+    private StingSAMIterator nestedSAMIterator;
+    private ReadsDownsamplerFactory downsamplerFactory;
+    private Map> perSampleDownsamplers;
+    private PriorityQueue orderedDownsampledReadsCache;
+    private SAMRecord nextRead = null;
+    private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator();
+    private SAMRecord earliestPendingRead = null;
+    private ReadsDownsampler earliestPendingDownsampler = null;
+
+    // Initial size of our cache of finalized reads
+    private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096;
+
+    // The number of positional changes that can occur in the read stream before all downsamplers
+    // should be informed of the current position (guards against samples with relatively sparse reads
+    // getting stuck in a pending state):
+    private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3;   // TODO: experiment with this value
+
+    /**
+     * @param iter wrapped iterator from which this iterator will pull reads
+     * @param downsamplerFactory factory used to create new downsamplers as needed
+     */
+    public PerSampleDownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsamplerFactory downsamplerFactory ) {
+        nestedSAMIterator = iter;
+        this.downsamplerFactory = downsamplerFactory;
+        perSampleDownsamplers = new HashMap>();
+        orderedDownsampledReadsCache = new PriorityQueue(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator);
+
+        advanceToNextRead();
+    }
+
+    public boolean hasNext() {
+        return nextRead != null;
+    }
+
+    public SAMRecord next() {
+        if ( nextRead == null ) {
+            throw new NoSuchElementException("next() called when there are no more items");
+        }
+
+        SAMRecord toReturn = nextRead;
+        advanceToNextRead();
+
+        return toReturn;
+    }
+
+    private void advanceToNextRead() {
+        if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
+            nextRead = null;
+        }
+        else {
+            nextRead = orderedDownsampledReadsCache.poll();
+        }
+    }
+
+    private boolean readyToReleaseReads() {
+        if ( orderedDownsampledReadsCache.isEmpty() ) {
+            return false;
+        }
+
+        return earliestPendingRead == null ||
+               readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0;
+    }
+
+    private void updateEarliestPendingRead( ReadsDownsampler currentDownsampler ) {
+        // If there is no recorded earliest pending read and this downsampler has pending items,
+        // then this downsampler's first pending item becomes the new earliest pending read:
+        if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) {
+            earliestPendingRead = currentDownsampler.peekPending();
+            earliestPendingDownsampler = currentDownsampler;
+        }
+        // In all other cases, we only need to update the earliest pending read when the downsampler
+        // associated with it experiences a change in its pending reads, since by assuming a sorted
+        // read stream we're assured that each downsampler's earliest pending read will only increase
+        // in genomic position over time.
+        //
+        // TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers
+        // TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))),
+        // TODO: but need to verify this empirically.
+        else if ( currentDownsampler == earliestPendingDownsampler &&
+                  (! currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) {
+
+            earliestPendingRead = null;
+            earliestPendingDownsampler = null;
+            for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) {
+                if ( perSampleDownsampler.hasPendingItems() &&
+                     (earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) {
+
+                    earliestPendingRead = perSampleDownsampler.peekPending();
+                    earliestPendingDownsampler = perSampleDownsampler;
+                }
+            }
+        }
+    }
+
+    private boolean fillDownsampledReadsCache() {
+        SAMRecord prevRead = null;
+        int numPositionalChanges = 0;
+
+        // Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue
+        // can be released without violating global sort order
+        while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) {
+            SAMRecord read = nestedSAMIterator.next();
+            String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
+
+            ReadsDownsampler thisSampleDownsampler = perSampleDownsamplers.get(sampleName);
+            if ( thisSampleDownsampler == null ) {
+                thisSampleDownsampler = downsamplerFactory.newInstance();
+                perSampleDownsamplers.put(sampleName, thisSampleDownsampler);
+            }
+
+            thisSampleDownsampler.submit(read);
+            updateEarliestPendingRead(thisSampleDownsampler);
+
+            if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) {
+                numPositionalChanges++;
+            }
+
+            // Periodically inform all downsamplers of the current position in the read stream. This is
+            // to prevent downsamplers for samples with sparser reads than others from getting stuck too
+            // long in a pending state.
+            if ( numPositionalChanges > 0 && numPositionalChanges % DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL == 0 ) {
+                for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) {
+                    perSampleDownsampler.signalNoMoreReadsBefore(read);
+                    updateEarliestPendingRead(perSampleDownsampler);
+                }
+            }
+
+            prevRead = read;
+        }
+
+        if ( ! nestedSAMIterator.hasNext() ) {
+            for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) {
+                perSampleDownsampler.signalEndOfInput();
+            }
+            earliestPendingRead = null;
+            earliestPendingDownsampler = null;
+        }
+
+        for ( ReadsDownsampler perSampleDownsampler : perSampleDownsamplers.values() ) {
+            if ( perSampleDownsampler.hasFinalizedItems() ) {
+                orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems());
+            }
+        }
+
+        return readyToReleaseReads();
+    }
+
+    public void remove() {
+        throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
+    }
+
+    public void close() {
+        nestedSAMIterator.close();
+    }
+
+    public Iterator iterator() {
+        return this;
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java
deleted file mode 100644
index f29c7728c..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/PositionalDownsampler.java
+++ /dev/null
@@ -1,259 +0,0 @@
-/*
- * Copyright (c) 2012, The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.downsampling;
-
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.MathUtils;
-import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
-
-import java.util.*;
-
-/**
- * Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions
- *
- * @author David Roazen
- */
-public class PositionalDownsampler implements ReadsDownsampler {
-
-    private int targetCoverage;
-
-    private ReservoirDownsampler reservoir;
-
-    private int currentContigIndex;
-
-    private int currentAlignmentStart;
-
-    private LinkedList pendingReads;
-
-    private ArrayList finalizedReads;
-
-    public PositionalDownsampler ( int targetCoverage ) {
-        this.targetCoverage = targetCoverage;
-        clear();
-    }
-
-    public void submit ( T newRead ) {
-        if ( readIsPastCurrentPosition(newRead) ) {
-            updateAndDownsamplePendingReads();
-        }
-
-        reservoir.submit(newRead);
-        updateCurrentPosition(newRead);
-    }
-
-    public void submit ( Collection newReads ) {
-        for ( T read : newReads ) {
-            submit(read);
-        }
-    }
-
-    public boolean hasDownsampledItems() {
-        return finalizedReads.size() > 0;
-    }
-
-    public List consumeDownsampledItems() {
-        List toReturn = finalizedReads;
-        finalizedReads = new ArrayList();
-        return toReturn;
-    }
-
-    public boolean hasPendingItems() {
-        return pendingReads.size() > 0;
-    }
-
-    public void signalEndOfInput() {
-        updateAndDownsamplePendingReads();
-
-        for ( PositionalReadGrouping group : pendingReads ) {
-            group.finalizeAllActiveReads();
-            finalizedReads.addAll(group.getFinalizedReads());
-        }
-
-        pendingReads.clear();
-    }
-
-    public void clear() {
-        reservoir = new ReservoirDownsampler(targetCoverage);
-        pendingReads = new LinkedList();
-        finalizedReads = new ArrayList();
-    }
-
-    public boolean requiresCoordinateSortOrder() {
-        return true;
-    }
-
-    private void updateCurrentPosition ( T read ) {
-        currentContigIndex = read.getReferenceIndex();
-        currentAlignmentStart = read.getAlignmentStart();
-    }
-
-    private boolean readIsPastCurrentPosition ( T read ) {
-        return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
-    }
-
-    private void updateAndDownsamplePendingReads() {
-        finalizeOutOfScopeReads();
-
-        List oldLocusReads = reservoir.consumeDownsampledItems();
-        pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));
-
-        downsampleOverlappingGroups();
-    }
-
-    private void finalizeOutOfScopeReads() {
-        Iterator iter = pendingReads.iterator();
-        boolean noPrecedingUnfinalizedGroups = true;
-
-        while ( iter.hasNext() ) {
-            PositionalReadGrouping currentGroup = iter.next();
-            currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);
-
-            if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
-                iter.remove();
-                finalizedReads.addAll(currentGroup.getFinalizedReads());
-            }
-            else {
-                noPrecedingUnfinalizedGroups = false;
-            }
-        }
-    }
-
-    private void downsampleOverlappingGroups() {
-        int[] groupReadCounts = new int[pendingReads.size()];
-        int totalCoverage = 0;
-        int numActiveGroups = 0;
-        int currentGroup = 0;
-
-        for ( PositionalReadGrouping group : pendingReads ) {
-            groupReadCounts[currentGroup] = group.numActiveReads();
-            totalCoverage += groupReadCounts[currentGroup];
-
-            if ( groupReadCounts[currentGroup] > 0 ) {
-                numActiveGroups++;
-            }
-
-            currentGroup++;
-        }
-
-        if ( totalCoverage <= targetCoverage ) {
-            return;
-        }
-
-        int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
-        currentGroup = 0;
-
-        while ( numReadsToRemove > 0  ) {
-            if ( groupReadCounts[currentGroup] > 1 ) {
-                groupReadCounts[currentGroup]--;
-                numReadsToRemove--;
-            }
-
-            currentGroup = (currentGroup + 1) % groupReadCounts.length;
-        }
-
-        currentGroup = 0;
-        for ( PositionalReadGrouping group : pendingReads ) {
-            if ( ! group.isFinalized() ) {
-                group.downsampleActiveReads(groupReadCounts[currentGroup]);
-            }
-            currentGroup++;
-        }
-    }
-
-    private class PositionalReadGrouping {
-        private List activeReads;
-        private List finalizedReads;
-
-        private int contig;
-        private int alignmentStart;
-
-        public PositionalReadGrouping( Collection reads, int contig, int alignmentStart ) {
-            activeReads = new LinkedList(reads);
-            finalizedReads = new ArrayList();
-            this.contig = contig;
-            this.alignmentStart = alignmentStart;
-        }
-
-        public int numActiveReads() {
-            return activeReads.size();
-        }
-
-        public boolean isFinalized() {
-            return activeReads.size() == 0;
-        }
-
-        public List getFinalizedReads() {
-            return finalizedReads;
-        }
-
-        public void finalizeActiveReadsBeforePosition( int contig, int position ) {
-            if ( this.contig != contig ) {
-                finalizeAllActiveReads();
-                return;
-            }
-
-            Iterator iter = activeReads.iterator();
-
-            while ( iter.hasNext() ) {
-                T read = iter.next();
-                if ( read.getAlignmentEnd() < position ) {
-                    iter.remove();
-                    finalizedReads.add(read);
-                }
-            }
-        }
-
-        public void finalizeAllActiveReads() {
-            finalizedReads.addAll(activeReads);
-            activeReads.clear();
-        }
-
-        public void downsampleActiveReads( int numReadsToKeep ) {
-            if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
-                throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
-                                                               numReadsToKeep, activeReads.size()));
-            }
-
-            BitSet itemsToKeep = new BitSet(activeReads.size());
-            for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
-                itemsToKeep.set(selectedIndex);
-            }
-
-            int currentIndex = 0;
-            Iterator iter = activeReads.iterator();
-
-            while ( iter.hasNext() ) {
-                T read = iter.next();
-
-                if ( ! itemsToKeep.get(currentIndex) ) {
-                    iter.remove();
-                }
-
-                currentIndex++;
-            }
-        }
-
-    }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java
index f78aaf4bf..3ff6f4454 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsampler.java
@@ -33,8 +33,23 @@ import net.sf.samtools.SAMRecord;
  */
 public interface ReadsDownsampler extends Downsampler {
 
-    /*
+    /**
      * Does this downsampler require that reads be fed to it in coordinate order?
+     *
+     * @return true if reads must be submitted to this downsampler in coordinate order, otherwise false
      */
     public boolean requiresCoordinateSortOrder();
+
+    /**
+     * Tell this downsampler that no more reads located before the provided read (according to
+     * the sort order of the read stream) will be fed to it.
+     *
+     * Allows position-aware downsamplers to finalize pending reads earlier than they would
+     * otherwise be able to, particularly when doing per-sample downsampling and reads for
+     * certain samples are sparser than average.
+     *
+     * @param read the downsampler will assume that no reads located before this read will ever
+     *             be submitted to it in the future
+     */
+    public void signalNoMoreReadsBefore( T read );
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java
new file mode 100644
index 000000000..2fa32497b
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReadsDownsamplerFactory.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.downsampling;
+
+import net.sf.samtools.SAMRecord;
+
+/**
+ * A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular
+ * downsampler, all sharing the same construction parameters.
+ *
+ * @author David Roazen
+ */
+public interface ReadsDownsamplerFactory {
+    public ReadsDownsampler newInstance();    // returns a downsampler built from this factory's construction parameters
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java
index cb40c7042..bab4734c4 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsampler.java
@@ -48,6 +48,14 @@ public class ReservoirDownsampler implements ReadsDownsampl
 
     private int totalReadsSeen;
 
+    private int numDiscardedItems;
+
+    /**
+     * Construct a ReservoirDownsampler
+     *
+     * @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
+     *                         after downsampling will be min(totalReads, targetSampleSize)
+     */
     public ReservoirDownsampler ( int targetSampleSize ) {
         if ( targetSampleSize <= 0 ) {
             throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
@@ -55,6 +63,7 @@ public class ReservoirDownsampler implements ReadsDownsampl
 
         this.targetSampleSize = targetSampleSize;
         clear();
+        reset();
     }
 
     public void submit ( T newRead ) {
@@ -68,6 +77,7 @@ public class ReservoirDownsampler implements ReadsDownsampl
             if ( randomSlot < targetSampleSize ) {
                 reservoir.set(randomSlot, newRead);
             }
+            numDiscardedItems++;
         }
     }
 
@@ -77,11 +87,12 @@ public class ReservoirDownsampler implements ReadsDownsampl
         }
     }
 
-    public boolean hasDownsampledItems() {
+    public boolean hasFinalizedItems() {
         return reservoir.size() > 0;
     }
 
-    public List consumeDownsampledItems() {
+    public List consumeFinalizedItems() {
+        // pass by reference rather than make a copy, for speed
         List downsampledItems = reservoir;
         clear();
         return downsampledItems;
@@ -91,16 +102,36 @@ public class ReservoirDownsampler implements ReadsDownsampl
         return false;
     }
 
+    /** Returns the first read currently held in the reservoir without removing it, or null if the reservoir is empty. */
+    public T peekFinalized() {
+        return reservoir.isEmpty() ? null : reservoir.get(0);
+    }
+    /** Always returns null: this class never exposes a pending read. */
+    public T peekPending() {
+        return null;
+    }
+    /** Returns the running count of discarded items (numDiscardedItems), which reset() sets back to zero. */
+    public int getNumberOfDiscardedItems() {
+        return numDiscardedItems;
+    }
     public void signalEndOfInput() {
         // NO-OP
     }
 
     public void clear() {
         reservoir = new ArrayList(targetSampleSize);
-        totalReadsSeen = 0;
+        totalReadsSeen = 0;    // an internal stat used by the downsampling process, so not cleared by reset() below
+    }
+    /** Zeroes the discarded-item counter only; the reservoir contents and totalReadsSeen are not touched here. */
+    public void reset() {
+        numDiscardedItems = 0;
+    }
 
     public boolean requiresCoordinateSortOrder() {
         return false;
     }
+    /** Ignores the positional hint: nothing is done with the supplied read. */
+    public void signalNoMoreReadsBefore( T read ) {
+        // NO-OP
+    }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java
new file mode 100644
index 000000000..040f0c788
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerFactory.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.downsampling;
+
+import net.sf.samtools.SAMRecord;
+
+/**
+ * Factory for creating ReservoirDownsamplers on demand
+ *
+ * @author David Roazen
+ */
+public class ReservoirDownsamplerFactory implements ReadsDownsamplerFactory {
+    /** Reservoir size passed to every ReservoirDownsampler this factory creates; fixed at construction. */
+    private final int targetSampleSize;
+    /** @param targetSampleSize reservoir size for created downsamplers; the ReservoirDownsampler constructor rejects values <= 0, which only surfaces at newInstance() */
+    public ReservoirDownsamplerFactory( int targetSampleSize ) {
+        this.targetSampleSize = targetSampleSize;
+    }
+    @Override    // each call builds a new ReservoirDownsampler from the stored targetSampleSize
+    public ReadsDownsampler newInstance() {
+        return new ReservoirDownsampler(targetSampleSize);
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java
new file mode 100644
index 000000000..30affc2b3
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsampler.java
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.downsampling;
+
+import net.sf.samtools.SAMRecord;
+
+import java.util.*;
+
+/**
+ * Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage
+ * using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time.
+ *
+ * @author David Roazen
+ */
+public class SimplePositionalDownsampler implements ReadsDownsampler {
+    // Maximum number of reads kept per alignment start; also the size of the reservoir created in the constructor.
+    private int targetCoverage;
+    // Holds the reads submitted at the current position; emptied into finalizedReads when the position advances.
+    private ReservoirDownsampler reservoir;
+    // Reference index of the read that last established the current position (see setCurrentPosition).
+    private int currentContigIndex;
+    // Alignment start of the read that last established the current position.
+    private int currentAlignmentStart;
+    // False after construction or clear() until a first read (submitted or signalled) sets the position.
+    private boolean positionEstablished;
+    // Set once an unmapped read advances the position; from then on submit() passes reads through undownsampled.
+    private boolean unmappedReadsReached;
+    // Reads that are ready to be handed out by consumeFinalizedItems().
+    private ArrayList finalizedReads;
+    // Number of reads dropped by the reservoir since the last reset().
+    private int numDiscardedItems;
+
+    /**
+     * Construct a SimplePositionalDownsampler
+     *
+     * @param targetCoverage Maximum number of reads that may share any given alignment start position
+     */
+    public SimplePositionalDownsampler( int targetCoverage ) {
+        this.targetCoverage = targetCoverage;
+        reservoir = new ReservoirDownsampler(targetCoverage);
+        finalizedReads = new ArrayList();
+        clear();
+        reset();
+    }
+    /** Submits one read: added directly to the finalized list once unmapped reads have been reached, otherwise fed to the reservoir. */
+    public void submit( T newRead ) {
+        updatePositionalState(newRead);
+
+        if ( unmappedReadsReached ) {    // don't downsample the unmapped reads at the end of the stream
+            finalizedReads.add(newRead);
+        }
+        else {
+            int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems();
+            reservoir.submit(newRead);
+            numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems;
+        }
+    }
+    /** Submits each read of the collection, in iteration order, via submit(T). */
+    public void submit( Collection newReads ) {
+        for ( T read : newReads ) {
+            submit(read);
+        }
+    }
+    /** @return true if at least one finalized read is waiting to be consumed */
+    public boolean hasFinalizedItems() {
+        return finalizedReads.size() > 0;
+    }
+    /** Hands the current finalized list to the caller and starts a new, empty one. */
+    public List consumeFinalizedItems() {
+        // pass by reference rather than make a copy, for speed
+        List toReturn = finalizedReads;
+        finalizedReads = new ArrayList();
+        return toReturn;
+    }
+    /** @return true if the reservoir currently holds reads, i.e. reads that have not been finalized yet */
+    public boolean hasPendingItems() {
+        return reservoir.hasFinalizedItems();
+    }
+    /** @return the first finalized read without removing it, or null if there is none */
+    public T peekFinalized() {
+        return finalizedReads.isEmpty() ? null : finalizedReads.get(0);
+    }
+    /** @return the result of the reservoir's peekFinalized(), i.e. a read still held in the reservoir (if any) */
+    public T peekPending() {
+        return reservoir.peekFinalized();
+    }
+    /** @return the number of reads discarded by downsampling since the last reset() */
+    public int getNumberOfDiscardedItems() {
+        return numDiscardedItems;
+    }
+    /** Moves everything still held in the reservoir to the finalized list. */
+    public void signalEndOfInput() {
+        finalizeReservoir();
+    }
+    /** Empties the reservoir and the finalized list and forgets the current position; numDiscardedItems is left as is. */
+    public void clear() {
+        reservoir.clear();
+        reservoir.reset();
+        finalizedReads.clear();
+        positionEstablished = false;
+        unmappedReadsReached = false;
+    }
+    /** Zeroes the discarded-read counter only. */
+    public void reset() {
+        numDiscardedItems = 0;
+    }
+    /** @return true: reads must be submitted to this downsampler in coordinate order */
+    public boolean requiresCoordinateSortOrder() {
+        return true;
+    }
+    /** Advances the positional state to the given read without submitting it. */
+    public void signalNoMoreReadsBefore( T read ) {
+        updatePositionalState(read);
+    }
+    /** If newRead is past the current position: finalizes a non-empty reservoir, adopts newRead's position, and notes when unmapped reads begin. */
+    private void updatePositionalState( T newRead ) {
+        if ( readIsPastCurrentPosition(newRead) ) {
+            if ( reservoir.hasFinalizedItems() ) {
+                finalizeReservoir();
+            }
+
+            setCurrentPosition(newRead);
+            // once set, this flag stays true until clear(), so every later submit() bypasses the reservoir
+            if ( newRead.getReadUnmappedFlag() ) {
+                unmappedReadsReached = true;
+            }
+        }
+    }
+    /** Records the read's reference index and alignment start as the current position. */
+    private void setCurrentPosition( T read ) {
+        currentContigIndex = read.getReferenceIndex();
+        currentAlignmentStart = read.getAlignmentStart();
+        positionEstablished = true;
+    }
+    /** @return true if no position is set yet, the read's reference index or alignment start exceeds the current one, or it is unmapped and unmapped reads were not reached before */
+    private boolean readIsPastCurrentPosition( T read ) {
+        return ! positionEstablished ||
+               read.getReferenceIndex() > currentContigIndex ||
+               read.getAlignmentStart() > currentAlignmentStart ||
+               (read.getReadUnmappedFlag() && ! unmappedReadsReached);
+    }
+    /** Appends the reservoir's reads to finalizedReads, then zeroes the reservoir's own discard counter. */
+    private void finalizeReservoir() {
+        finalizedReads.addAll(reservoir.consumeFinalizedItems());
+        reservoir.reset();
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java
new file mode 100644
index 000000000..fcc18b16b
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerFactory.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2012, The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.downsampling;
+
+import net.sf.samtools.SAMRecord;
+
+/**
+ * Factory for creating SimplePositionalDownsamplers on demand
+ *
+ * @author David Roazen
+ */
+public class SimplePositionalDownsamplerFactory implements ReadsDownsamplerFactory {
+    /** Target coverage passed to every SimplePositionalDownsampler this factory creates; fixed at construction. */
+    private final int targetCoverage;
+    /** @param targetCoverage maximum number of reads that may share an alignment start in each created downsampler */
+    public SimplePositionalDownsamplerFactory( int targetCoverage ) {
+        this.targetCoverage = targetCoverage;
+    }
+    @Override    // each call builds a new SimplePositionalDownsampler from the stored targetCoverage
+    public ReadsDownsampler newInstance() {
+        return new SimplePositionalDownsampler(targetCoverage);
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
index 70b1be0e1..cc0161b2d 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroScheduler.java
@@ -7,10 +7,13 @@ import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
 import org.broadinstitute.sting.gatk.datasources.reads.Shard;
 import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
 import org.broadinstitute.sting.gatk.io.OutputTracker;
-import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
+import org.broadinstitute.sting.gatk.io.ThreadGroupOutputTracker;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
 import org.broadinstitute.sting.gatk.walkers.TreeReducible;
 import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.utils.MultiThreadedErrorTracker;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor;
 
 import java.util.Collection;
@@ -36,14 +39,14 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
     /**
      * A thread local output tracker for managing output per-thread.
      */
-    private ThreadLocalOutputTracker outputTracker = new ThreadLocalOutputTracker();
+    private ThreadGroupOutputTracker outputTracker = new ThreadGroupOutputTracker();
 
     private final Queue reduceTasks = new LinkedList();
 
     /**
      * An exception that's occurred in this traversal.  If null, no exception has occurred.
      */
-    private RuntimeException error = null;
+    final MultiThreadedErrorTracker errorTracker = new MultiThreadedErrorTracker();
 
     /**
      * Queue of incoming shards.
@@ -75,14 +78,39 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
     /**
      * Create a new hierarchical microscheduler to process the given reads and reference.
      *
-     * @param walker        the walker used to process the dataset.
-     * @param reads         Reads file(s) to process.
-     * @param reference     Reference for driving the traversal.
-     * @param nThreadsToUse maximum number of threads to use to do the work
+     * @param walker           the walker used to process the dataset.
+     * @param reads            Reads file(s) to process.
+     * @param reference        Reference for driving the traversal.
+     * @param threadAllocation How should we apply multi-threaded execution?
      */
-    protected HierarchicalMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, int nThreadsToUse ) {
-        super(engine, walker, reads, reference, rods);
-        this.threadPool = Executors.newFixedThreadPool(nThreadsToUse);
+    protected HierarchicalMicroScheduler(final GenomeAnalysisEngine engine,
+                                         final Walker walker,
+                                         final SAMDataSource reads,
+                                         final IndexedFastaSequenceFile reference,
+                                         final Collection rods,
+                                         final ThreadAllocation threadAllocation) {
+        super(engine, walker, reads, reference, rods, threadAllocation);
+        // the worker pool is sized from the data-thread count; thread-efficiency monitoring is rejected for this scheduler
+        final int nThreadsToUse = threadAllocation.getNumDataThreads();
+        if ( threadAllocation.monitorThreadEfficiency() ) {
+            throw new UserException.BadArgumentValue("nt", "Cannot monitor thread efficiency with -nt, sorry");
+        }
+        // every pool thread is created in its own ThreadGroup by UniqueThreadGroupThreadFactory
+        this.threadPool = Executors.newFixedThreadPool(nThreadsToUse, new UniqueThreadGroupThreadFactory());
+    }
+
+    /**
+     * Creates threads for HMS each with a unique thread group.  Critical to
+     * track outputs via the ThreadGroupOutputTracker.
+     */
+    private static class UniqueThreadGroupThreadFactory implements ThreadFactory {
+        int counter = 0;    // suffix for the next group name; a plain int, so newThread() is not safe to call concurrently
+        /** Creates a thread for r inside a newly created ThreadGroup named "HMS-group-" followed by the running counter. */
+        @Override
+        public Thread newThread(Runnable r) {
+            final ThreadGroup group = new ThreadGroup("HMS-group-" + counter++);
+            return new Thread(group, r);
+        }
     }
 
     public Object execute( Walker walker, Iterable shardStrategy ) {
@@ -92,13 +120,12 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
 
         this.traversalTasks = shardStrategy.iterator();
 
-        ReduceTree reduceTree = new ReduceTree(this);
+        final ReduceTree reduceTree = new ReduceTree(this);
         initializeWalker(walker);
 
-        while (isShardTraversePending() || isTreeReducePending()) {
+        while (! abortExecution() && (isShardTraversePending() || isTreeReducePending())) {
             // Check for errors during execution.
-            if(hasTraversalErrorOccurred())
-                throw getTraversalError();
+            errorTracker.throwErrorIfPending();
 
             // Too many files sitting around taking up space?  Merge them.
             if (isMergeLimitExceeded())
@@ -115,8 +142,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
                 queueNextShardTraverse(walker, reduceTree);
         }
 
-        if(hasTraversalErrorOccurred())
-            throw getTraversalError();
+        errorTracker.throwErrorIfPending();
 
         threadPool.shutdown();
 
@@ -132,7 +158,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
             throw ex;
         } catch ( ExecutionException ex ) {
             // the thread died and we are failing to get the result, rethrow it as a runtime exception
-            throw toRuntimeException(ex.getCause());
+            throw notifyOfTraversalError(ex.getCause());
         } catch (Exception ex) {
             throw new ReviewedStingException("Unable to retrieve result", ex);
         }
@@ -140,6 +166,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
         // do final cleanup operations
         outputTracker.close();
         cleanup();
+        executionIsDone();
 
         return result;
     }
@@ -170,7 +197,6 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
         outputTracker.bypassThreadLocalStorage(true);
         try {
             walker.onTraversalDone(result);
-            printOnTraversalDone(result);
         }
         finally {
             outputTracker.bypassThreadLocalStorage(false);
@@ -239,6 +265,9 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
     protected void mergeExistingOutput( boolean wait ) {
         long startTime = System.currentTimeMillis();
 
+//        logger.warn("MergingExistingOutput");
+//        printOutputMergeTasks();
+
         // Create a list of the merge tasks that will be performed in this run of the mergeExistingOutput().
         Queue mergeTasksInSession = new LinkedList();
         while( !outputMergeTasks.isEmpty() ) {
@@ -252,8 +281,12 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
             mergeTasksInSession.add(traverser);
         }
 
+//        logger.warn("Selected things to merge:");
+//        printOutputMergeTasks(mergeTasksInSession);
+
         // Actually run through, merging the tasks in the working queue.
         for( ShardTraverser traverser: mergeTasksInSession ) {
+            //logger.warn("*** Merging " + traverser.getIntervalsString());
             if( !traverser.isComplete() )
                 traverser.waitForComplete();
 
@@ -286,32 +319,41 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
         if (!traversalTasks.hasNext())
             throw new IllegalStateException("Cannot traverse; no pending traversals exist.");
 
-        Shard shard = traversalTasks.next();
+        final Shard shard = traversalTasks.next();
 
         // todo -- add ownership claim here
 
-        ShardTraverser traverser = new ShardTraverser(this,
-                traversalEngine,
-                walker,
-                shard,
-                outputTracker);
+        final ShardTraverser traverser = new ShardTraverser(this, walker, shard, outputTracker);
 
-        Future traverseResult = threadPool.submit(traverser);
+        final Future traverseResult = threadPool.submit(traverser);
 
         // Add this traverse result to the reduce tree.  The reduce tree will call a callback to throw its entries on the queue.
         reduceTree.addEntry(traverseResult);
         outputMergeTasks.add(traverser);
 
+//        logger.warn("adding merge task");
+//        printOutputMergeTasks();
+
         // No more data?  Let the reduce tree know so it can finish processing what it's got.
         if (!isShardTraversePending())
             reduceTree.complete();
     }
 
+    private synchronized void printOutputMergeTasks() {
+        printOutputMergeTasks(outputMergeTasks);
+    }
+
+    private synchronized void printOutputMergeTasks(final Queue tasks) {
+        logger.info("Output merge tasks " + tasks.size());
+        for ( final ShardTraverser traverser : tasks )
+            logger.info(String.format("\t%s: complete? %b", traverser.getIntervalsString(), traverser.isComplete()));
+    }
+
     /** Pulls the next reduce from the queue and runs it. */
     protected void queueNextTreeReduce( Walker walker ) {
         if (reduceTasks.size() == 0)
             throw new IllegalStateException("Cannot reduce; no pending reduces exist.");
-        TreeReduceTask reducer = reduceTasks.remove();
+        final TreeReduceTask reducer = reduceTasks.remove();
         reducer.setWalker((TreeReducible) walker);
 
         threadPool.submit(reducer);
@@ -319,7 +361,7 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
 
     /** Blocks until a free slot appears in the thread queue. */
     protected void waitForFreeQueueSlot() {
-        ThreadPoolMonitor monitor = new ThreadPoolMonitor();
+        final ThreadPoolMonitor monitor = new ThreadPoolMonitor();
         synchronized (monitor) {
             threadPool.submit(monitor);
             monitor.watch();
@@ -331,47 +373,22 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
      *
      * @return A new, composite future of the result of this reduce.
      */
-    public Future notifyReduce( Future lhs, Future rhs ) {
-        TreeReduceTask reducer = new TreeReduceTask(new TreeReducer(this, lhs, rhs));
+    public Future notifyReduce( final Future lhs, final Future rhs ) {
+        final TreeReduceTask reducer = new TreeReduceTask(new TreeReducer(this, lhs, rhs));
         reduceTasks.add(reducer);
         return reducer;
     }
 
-    /**
-     * Detects whether an execution error has occurred.
-     * @return True if an error has occurred.  False otherwise.
-     */
-    private synchronized boolean hasTraversalErrorOccurred() {
-        return error != null;
-    }
-
-    private synchronized RuntimeException getTraversalError() {
-        if(!hasTraversalErrorOccurred())
-            throw new ReviewedStingException("User has attempted to retrieve a traversal error when none exists");
-        return error;
-    }
-
     /**
      * Allows other threads to notify of an error during traversal.
      */
     protected synchronized RuntimeException notifyOfTraversalError(Throwable error) {
-        // If the error is already a Runtime, pass it along as is.  Otherwise, wrap it.
-        this.error = toRuntimeException(error);
-        return this.error;
+        return errorTracker.notifyOfError(error);
     }
 
-    private final RuntimeException toRuntimeException(final Throwable error) {
-        // If the error is already a Runtime, pass it along as is.  Otherwise, wrap it.
-        if (error instanceof RuntimeException)
-            return (RuntimeException)error;
-        else
-            return new ReviewedStingException("An error occurred during the traversal.  Message=" + error.getMessage(), error);
-    }
-
-
     /** A small wrapper class that provides the TreeReducer interface along with the FutureTask semantics. */
     private class TreeReduceTask extends FutureTask {
-        private TreeReducer treeReducer = null;
+        final private TreeReducer treeReducer;
 
         public TreeReduceTask( TreeReducer treeReducer ) {
             super(treeReducer);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java
index 530285db0..87d0ad721 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/HierarchicalMicroSchedulerMBean.java
@@ -16,7 +16,7 @@ package org.broadinstitute.sting.gatk.executive;
  * An interface for retrieving runtime statistics about how the hierarchical
  * microscheduler is behaving. 
  */
-public interface HierarchicalMicroSchedulerMBean extends MicroSchedulerMBean {
+public interface HierarchicalMicroSchedulerMBean {
     /**
      * How many tree reduces are waiting in the tree reduce queue?
      * @return Total number of reduces waiting in the tree reduce queue?
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
index b35abb775..f3c1ae91c 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java
@@ -10,9 +10,12 @@ import org.broadinstitute.sting.gatk.datasources.reads.Shard;
 import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
 import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
 import org.broadinstitute.sting.gatk.io.OutputTracker;
+import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
+import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
 import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions;
 import org.broadinstitute.sting.gatk.walkers.Walker;
 import org.broadinstitute.sting.utils.SampleUtils;
+import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
 
 import java.util.Collection;
 
@@ -33,8 +36,16 @@ public class LinearMicroScheduler extends MicroScheduler {
      * @param reference Reference for driving the traversal.
      * @param rods      Reference-ordered data.
      */
-    protected LinearMicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods ) {
-        super(engine, walker, reads, reference, rods);
+    protected LinearMicroScheduler(final GenomeAnalysisEngine engine,
+                                   final Walker walker,
+                                   final SAMDataSource reads,
+                                   final IndexedFastaSequenceFile reference,
+                                   final Collection rods,
+                                   final ThreadAllocation threadAllocation) {
+        super(engine, walker, reads, reference, rods, threadAllocation);
+
+        if ( threadAllocation.monitorThreadEfficiency() )
+            setThreadEfficiencyMonitor(new ThreadEfficiencyMonitor());
     }
 
     /**
@@ -49,11 +60,12 @@ public class LinearMicroScheduler extends MicroScheduler {
 
         boolean done = walker.isDone();
         int counter = 0;
+
+        final TraversalEngine traversalEngine = borrowTraversalEngine(this);
         for (Shard shard : shardStrategy ) {
-            if ( done || shard == null ) // we ran out of shards that aren't owned
+            if ( abortExecution() || done || shard == null ) // we ran out of shards that aren't owned
                 break;
 
-            traversalEngine.startTimersIfNecessary();
             if(shard.getShardType() == Shard.ShardType.LOCUS) {
                 WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(),
                         getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine));
@@ -84,10 +96,10 @@ public class LinearMicroScheduler extends MicroScheduler {
                 
         Object result = accumulator.finishTraversal();
 
-        printOnTraversalDone(result);
-
         outputTracker.close();
+        returnTraversalEngine(this, traversalEngine);
         cleanup();
+        executionIsDone();
 
         return accumulator;
     }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
index 95e39b7c6..38170040a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java
@@ -25,9 +25,11 @@
 
 package org.broadinstitute.sting.gatk.executive;
 
+import com.google.java.contract.Ensures;
 import net.sf.picard.reference.IndexedFastaSequenceFile;
 import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.ReadMetrics;
 import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource;
 import org.broadinstitute.sting.gatk.datasources.reads.Shard;
 import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
@@ -37,14 +39,21 @@ import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
 import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
 import org.broadinstitute.sting.gatk.traversals.*;
 import org.broadinstitute.sting.gatk.walkers.*;
+import org.broadinstitute.sting.utils.AutoFormattingTime;
+import org.broadinstitute.sting.utils.MathUtils;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler;
+import org.broadinstitute.sting.utils.progressmeter.ProgressMeter;
+import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
 
 import javax.management.JMException;
 import javax.management.MBeanServer;
 import javax.management.ObjectName;
+import java.io.File;
 import java.lang.management.ManagementFactory;
-import java.util.Collection;
+import java.util.*;
+import java.util.concurrent.TimeUnit;
 
 
 /**
@@ -54,12 +63,39 @@ import java.util.Collection;
  * Time: 12:37:23 PM
  *
  * General base class for all scheduling algorithms
+ * Shards and schedules data in manageable chunks.
+ *
+ * Creates one TraversalEngine for each of the N data threads of the MicroScheduler.  This is necessary
+ * because in the HMS (HierarchicalMicroScheduler) case you have multiple threads executing a traversal engine independently, and
+ * these engines may need to create separate resources for efficiency or implementation reasons.  For example,
+ * the nanoScheduler creates threads to implement the traversal, and this creation is instance specific.
+ * So each HMS thread needs to have its own distinct copy of the traversal engine if it wants to have
+ * N data threads x M nano threads => N * M threads total.  These are borrowed from this microscheduler
+ * and returned when done.  Also allows us to track all created traversal engines so this microscheduler
+ * can properly shut them all down when the scheduling is done.
+ *
  */
-
-/** Shards and schedules data in manageable chunks. */
 public abstract class MicroScheduler implements MicroSchedulerMBean {
     protected static final Logger logger = Logger.getLogger(MicroScheduler.class);
 
+    /**
+     * The list of all Traversal engines we've created in this micro scheduler
+     */
+    final List allCreatedTraversalEngines = new LinkedList();
+
+    /**
+     * All available engines.  Engines are borrowed and returned when a subclass is actually
+     * going to execute the engine on some data.  This allows us to have N copies for
+     * N data parallel executions, but without the dangerous code of having local
+     * ThreadLocal variables.
+     */
+    final LinkedList availableTraversalEngines = new LinkedList();
+
+    /**
+     * Engines that have been allocated to a key already.
+     */
+    final HashMap allocatedTraversalEngines = new HashMap();
+
     /**
      * Counts the number of instances of the class that are currently alive.
      */
@@ -70,7 +106,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
      */
     protected final GenomeAnalysisEngine engine;
 
-    protected final TraversalEngine traversalEngine;
     protected final IndexedFastaSequenceFile reference;
 
     private final SAMDataSource reads;
@@ -79,6 +114,15 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
     private final MBeanServer mBeanServer;
     private final ObjectName mBeanName;
 
+    /**
+     * Threading efficiency monitor for tracking the resource utilization of the GATK
+     *
+     * may be null
+     */
+    ThreadEfficiencyMonitor threadEfficiencyMonitor = null;
+
+    final ProgressMeter progressMeter;
+
     /**
      * MicroScheduler factory function.  Create a microscheduler appropriate for reducing the
      * selected walker.
@@ -92,18 +136,45 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
      * @return The best-fit microscheduler.
      */
     public static MicroScheduler create(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods, ThreadAllocation threadAllocation) {
-        if (walker instanceof TreeReducible && threadAllocation.getNumCPUThreads() > 1) {
-            if(walker.isReduceByInterval())
-                throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval.  Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution.  Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
-            if(walker instanceof ReadWalker)
-                throw new UserException.BadArgumentValue("nt", String.format("The analysis %s is a read walker.  Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution.  Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
-            logger.info(String.format("Running the GATK in parallel mode with %d concurrent threads",threadAllocation.getNumCPUThreads()));
-            return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation.getNumCPUThreads());
-        } else {
-            if(threadAllocation.getNumCPUThreads() > 1)
-                throw new UserException.BadArgumentValue("nt", String.format("The analysis %s currently does not support parallel execution.  Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
-            return new LinearMicroScheduler(engine, walker, reads, reference, rods);
+        if ( threadAllocation.isRunningInParallelMode() ) {
+            logger.info(String.format("Running the GATK in parallel mode with %d total threads, " +
+                    "%d CPU thread(s) for each of %d data thread(s), of %d processors available on this machine",
+                    threadAllocation.getTotalNumThreads(),
+                    threadAllocation.getNumCPUThreadsPerDataThread(),
+                    threadAllocation.getNumDataThreads(),
+                    Runtime.getRuntime().availableProcessors()));
+            if ( threadAllocation.getTotalNumThreads() > Runtime.getRuntime().availableProcessors() )
+                logger.warn(String.format("Number of requested GATK threads %d is more than the number of " +
+                        "available processors on this machine %d", threadAllocation.getTotalNumThreads(),
+                        Runtime.getRuntime().availableProcessors()));
+//            if ( threadAllocation.getNumDataThreads() > 1 && threadAllocation.getNumCPUThreadsPerDataThread() > 1)
+//                throw new UserException("The GATK currently doesn't support running with both -nt > 1 and -nct > 1");
         }
+
+        if ( threadAllocation.getNumDataThreads() > 1 ) {
+            if (walker.isReduceByInterval())
+                throw new UserException.BadArgumentValue("nt", String.format("The analysis %s aggregates results by interval.  Due to a current limitation of the GATK, analyses of this type do not currently support parallel execution.  Please run your analysis without the -nt option.", engine.getWalkerName(walker.getClass())));
+
+            if ( ! (walker instanceof TreeReducible) ) {
+                throw badNT("nt", engine, walker);
+            }
+        }
+
+        if ( threadAllocation.getNumCPUThreadsPerDataThread() > 1 && ! (walker instanceof NanoSchedulable) ) {
+            throw badNT("nct", engine, walker);
+        }
+
+        if ( threadAllocation.getNumDataThreads() > 1 ) {
+            return new HierarchicalMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
+        } else {
+            return new LinearMicroScheduler(engine, walker, reads, reference, rods, threadAllocation);
+        }
+    }
+
+    private static UserException badNT(final String parallelArg, final GenomeAnalysisEngine engine, final Walker walker) {
+        return new UserException.BadArgumentValue(parallelArg, // build only: every caller writes `throw badNT(...)`
+                String.format("The analysis %s currently does not support parallel execution with %s.  " +
+                        "Please run your analysis without the %s option.", engine.getWalkerName(walker.getClass()), parallelArg, parallelArg));
     }
 
     /**
@@ -113,28 +184,37 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
      * @param reads   The reads.
      * @param reference The reference.
      * @param rods    the rods to include in the traversal
+     * @param threadAllocation the allocation of threads to use in the underlying traversal
      */
-    protected MicroScheduler(GenomeAnalysisEngine engine, Walker walker, SAMDataSource reads, IndexedFastaSequenceFile reference, Collection rods) {
+    protected MicroScheduler(final GenomeAnalysisEngine engine,
+                             final Walker walker,
+                             final SAMDataSource reads,
+                             final IndexedFastaSequenceFile reference,
+                             final Collection rods,
+                             final ThreadAllocation threadAllocation) {
         this.engine = engine;
         this.reads = reads;
         this.reference = reference;
         this.rods = rods;
 
-        if (walker instanceof ReadWalker) {
-            traversalEngine = new TraverseReads();
-        } else if (walker instanceof LocusWalker) {
-            traversalEngine = new TraverseLoci();
-        } else if (walker instanceof DuplicateWalker) {
-            traversalEngine = new TraverseDuplicates();
-        } else if (walker instanceof ReadPairWalker) {
-            traversalEngine = new TraverseReadPairs();
-        } else if (walker instanceof ActiveRegionWalker) {
-            traversalEngine = new TraverseActiveRegions();
-        } else {
-            throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
-        }        
+        final File progressLogFile = engine.getArguments() == null ? null : engine.getArguments().performanceLog;
 
-        traversalEngine.initialize(engine);
+        // Creates uninitialized TraversalEngines appropriate for walker and threadAllocation,
+        // and adds it to the list of created engines for later shutdown.
+        for ( int i = 0; i < threadAllocation.getNumDataThreads(); i++ ) {
+            final TraversalEngine traversalEngine = createTraversalEngine(walker, threadAllocation);
+            allCreatedTraversalEngines.add(traversalEngine);
+            availableTraversalEngines.add(traversalEngine);
+        }
+
+        // Create our progress meter
+        this.progressMeter = new ProgressMeter(progressLogFile,
+                availableTraversalEngines.peek().getTraversalUnits(),
+                engine.getRegionsOfGenomeBeingProcessed());
+
+        // Now that we have a progress meter, go through and initialize the traversal engines
+        for ( final TraversalEngine traversalEngine : allCreatedTraversalEngines )
+            traversalEngine.initialize(engine, progressMeter);
 
         // JMX does not allow multiple instances with the same ObjectName to be registered with the same platform MXBean.
         // To get around this limitation and since we have no job identifier at this point, register a simple counter that
@@ -150,6 +230,67 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
         }
     }
 
+    /**
+     * Really make us a traversal engine of the appropriate type for walker and thread allocation
+     *
+     * @return a non-null uninitialized traversal engine
+     */
+    @Ensures("result != null")
+    private TraversalEngine createTraversalEngine(final Walker walker, final ThreadAllocation threadAllocation) {
+        if (walker instanceof ReadWalker) {
+            return new TraverseReadsNano(threadAllocation.getNumCPUThreadsPerDataThread());
+        } else if (walker instanceof LocusWalker) {
+            return new TraverseLociNano(threadAllocation.getNumCPUThreadsPerDataThread());
+        } else if (walker instanceof DuplicateWalker) {
+            return new TraverseDuplicates();
+        } else if (walker instanceof ReadPairWalker) {
+            return new TraverseReadPairs();
+        } else if (walker instanceof ActiveRegionWalker) {
+            return new TraverseActiveRegions();
+        } else {
+            throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type.");
+        }
+    }
+
+
+    /**
+     * Return the ThreadEfficiencyMonitor we are using to track our resource utilization, if there is one
+     *
+     * @return the monitor, or null if none is active
+     */
+    public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() {
+        return threadEfficiencyMonitor;
+    }
+
+    /**
+     * Inform this Microscheduler to use the efficiency monitor used to create threads in subclasses
+     *
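+     * @param threadEfficiencyMonitor the monitor to use; may be null, in which case no usage information is printed when execution is done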
+     */
+    public void setThreadEfficiencyMonitor(final ThreadEfficiencyMonitor threadEfficiencyMonitor) {
+        this.threadEfficiencyMonitor = threadEfficiencyMonitor;
+    }
+
+    /**
+     * Should we stop all execution work and exit gracefully?
+     *
+     * Returns true in the case where some external signal or time limit has been received, indicating
+     * that this GATK shouldn't continue executing.  This isn't a kill signal, it is really a "shutdown
+     * gracefully at the next opportunity" signal.  Concrete implementations of the MicroScheduler
+     * examine this value as often as reasonable and, if it returns true, stop what they are doing
+     * at the next available opportunity, shutdown their resources, call notify done, and return.
+     *
+     * @return true if we should abort execution, or false otherwise
+     */
+    protected boolean abortExecution() {
+        final boolean abort = engine.exceedsRuntimeLimit(progressMeter.getRuntimeInNanoseconds(), TimeUnit.NANOSECONDS);
+        if ( abort ) {
+            final AutoFormattingTime aft = new AutoFormattingTime(TimeUnit.SECONDS.convert(engine.getRuntimeLimitInNanoseconds(), TimeUnit.NANOSECONDS), 1, 4);
+            logger.info("Aborting execution (cleanly) because the runtime has exceeded the requested maximum " + aft);
+        }
+        return abort;
+    }
+
     /**
      * Walks a walker over the given list of intervals.
      *
@@ -176,11 +317,71 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
     }
 
     /**
-     * Print summary information for the analysis.
-     * @param sum The final reduce output.
+     * Must be called by subclasses when execute is done
      */
-    protected void printOnTraversalDone(Object sum) {
-        traversalEngine.printOnTraversalDone();
+    protected void executionIsDone() {
+        progressMeter.notifyDone(engine.getCumulativeMetrics().getNumIterations());
+        printReadFilteringStats();
+        shutdownTraversalEngines();
+
+        // Print out the threading efficiency of this MicroScheduler, if state monitoring is enabled
+        if ( threadEfficiencyMonitor != null ) {
+            // include the master thread information
+            threadEfficiencyMonitor.threadIsDone(Thread.currentThread());
+            threadEfficiencyMonitor.printUsageInformation(logger);
+        }
+    }
+
+    /**
+     * Shutdown all of the created engines, and clear the list of created engines, dropping
+     * pointers to the traversal engines
+     */
+    public synchronized void shutdownTraversalEngines() {
+        // no longer applicable because engines are allocated to keys now
+//        if ( availableTraversalEngines.size() != allCreatedTraversalEngines.size() )
+//            throw new IllegalStateException("Shutting down TraversalEngineCreator but not all engines " +
+//                    "have been returned.  Expected " + allCreatedTraversalEngines.size() + " but only " + availableTraversalEngines.size()
+//                    + " have been returned");
+
+        for ( final TraversalEngine te : allCreatedTraversalEngines)
+            te.shutdown();
+
+        // horrible hack to print nano scheduling information across all nano schedulers, if any were used
+        NanoScheduler.printCombinedRuntimeProfile();
+
+        allCreatedTraversalEngines.clear();
+        availableTraversalEngines.clear();
+    }
+
+    /**
+     * Prints out information about number of reads observed and filtering, if any reads were used in the traversal
+     *
+     * Looks like:
+     *
+     * INFO  10:40:47,370 MicroScheduler - 22 reads were filtered out during traversal out of 101 total (21.78%)
+     * INFO  10:40:47,370 MicroScheduler -   -> 1 reads (0.99% of total) failing BadMateFilter
+     * INFO  10:40:47,370 MicroScheduler -   -> 20 reads (19.80% of total) failing DuplicateReadFilter
+     * INFO  10:40:47,370 MicroScheduler -   -> 1 reads (0.99% of total) failing FailsVendorQualityCheckFilter
+     */
+    private void printReadFilteringStats() {
+        final ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics();
+        if ( cumulativeMetrics.getNumReadsSeen() > 0 ) {
+            // count up the number of skipped reads by summing over all filters
+            long nSkippedReads = 0L;
+            for ( final long countsByFilter : cumulativeMetrics.getCountsByFilter().values())
+                nSkippedReads += countsByFilter;
+
+            logger.info(String.format("%d reads were filtered out during traversal out of %d total (%.2f%%)",
+                    nSkippedReads,
+                    cumulativeMetrics.getNumReadsSeen(),
+                    100.0 * MathUtils.ratio(nSkippedReads, cumulativeMetrics.getNumReadsSeen())));
+
+            for ( final Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) {
+                long count = filterCounts.getValue();
+                logger.info(String.format("  -> %d reads (%.2f%% of total) failing %s",
+                        count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey()));
+            }
+        }
     }
 
     /**
@@ -201,38 +402,6 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
      */
     public IndexedFastaSequenceFile getReference() { return reference; }
 
-    /**
-     * Gets the filename to which performance data is currently being written.
-     * @return Filename to which performance data is currently being written.
-     */
-    public String getPerformanceLogFileName() {
-        return traversalEngine.getPerformanceLogFileName();
-    }
-
-    /**
-     * Set the filename of the log for performance.  If set,
-     * @param fileName filename to use when writing performance data.
-     */
-    public void setPerformanceLogFileName(String fileName) {
-        traversalEngine.setPerformanceLogFileName(fileName);
-    }
-
-    /**
-     * Gets the frequency with which performance data is written.
-     * @return Frequency, in seconds, of performance log writes.
-     */
-    public long getPerformanceProgressPrintFrequencySeconds() {
-        return traversalEngine.getPerformanceProgressPrintFrequencySeconds();
-    }    
-
-    /**
-     * How often should the performance log message be written?
-     * @param seconds number of seconds between messages indicating performance frequency.
-     */
-    public void setPerformanceProgressPrintFrequencySeconds(long seconds) {
-        traversalEngine.setPerformanceProgressPrintFrequencySeconds(seconds);
-    }
-
     protected void cleanup() {
         try {
             mBeanServer.unregisterMBean(mBeanName);
@@ -241,4 +410,58 @@ public abstract class MicroScheduler implements MicroSchedulerMBean {
             throw new ReviewedStingException("Unable to unregister microscheduler with JMX", ex);
         }
     }
+
+    /**
+     * Returns a traversal engine suitable for use, associated with key
+     *
+     * Key is an arbitrary object that is used to retrieve the same traversal
+     * engine over and over.  This can be important in the case where the
+     * traversal engine has data associated with it in some other context,
+     * and we need to ensure that the context always sees the same traversal
+     * engine.  This happens in the HierarchicalMicroScheduler, where you want
+     * a thread executing traversals to retrieve the same engine each time,
+     * as outputs are tracked w.r.t. that engine.
+     *
+     * If no engine is associated with key yet, pops the next available engine
+     * from the available ones maintained by this
+     * microscheduler.  Note that it's a runtime error to pop a traversal engine
+     * from this scheduler if there are none available.  Callers that
+     * once pop'd an engine for use must return it with returnTraversalEngine
+     *
+     * @param key the key to associate with this engine
+     * @return a non-null TraversalEngine suitable for execution in this scheduler
+     */
+    @Ensures("result != null")
+    protected synchronized TraversalEngine borrowTraversalEngine(final Object key) {
+        if ( key == null ) throw new IllegalArgumentException("key cannot be null");
+
+        final TraversalEngine alreadyAllocated = allocatedTraversalEngines.get(key);
+        if ( alreadyAllocated != null )
+            return alreadyAllocated;
+        // first request for this key: hand out the next free engine and remember the binding
+        if ( availableTraversalEngines.isEmpty() )
+            throw new IllegalStateException("no traversal engines were available");
+        final TraversalEngine freshlyAllocated = availableTraversalEngines.pop();
+        allocatedTraversalEngines.put(key, freshlyAllocated);
+        return freshlyAllocated;
+    }
+
+    /**
+     * Return a borrowed traversal engine to this MicroScheduler, for later use
+     * in another traversal execution
+     *
+     * @param key the key used to id the engine, provided to the borrowTraversalEngine function
+     * @param traversalEngine the borrowed traversal engine.  Must have been previously borrowed.
+     * @throws IllegalArgumentException if traversalEngine is null or not created by this scheduler, or key was never used to borrow
+     */
+    protected synchronized void returnTraversalEngine(final Object key, final TraversalEngine traversalEngine) {
+        if ( traversalEngine == null )
+            throw new IllegalArgumentException("Attempting to push a null traversal engine");
+        if ( ! allCreatedTraversalEngines.contains(traversalEngine) )
+            throw new IllegalArgumentException("Attempting to push a traversal engine not created by this MicroScheduler: " + traversalEngine);
+        if ( ! allocatedTraversalEngines.containsKey(key) )
+            throw new IllegalArgumentException("No traversal engine was ever checked out with key " + key);
+
+        // nothing to release: the key -> engine binding stays in place so a later borrow with this key gets the same engine
+    }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java
index e510822b8..8be6b0b62 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroSchedulerMBean.java
@@ -31,27 +31,5 @@ package org.broadinstitute.sting.gatk.executive;
  * To change this template use File | Settings | File Templates.
  */
 public interface MicroSchedulerMBean {
-    /**
-     * Gets the filename to which performance data is currently being written.
-     * @return Filename to which performance data is currently being written.
-     */
-    public String getPerformanceLogFileName();
-
-    /**
-     * Set the filename of the log for performance.  If set,
-     * @param fileName filename to use when writing performance data.
-     */
-    public void setPerformanceLogFileName(String fileName);
-
-    /**
-     * Gets the frequency with which performance data is written.
-     * @return Frequency, in seconds, of performance log writes.
-     */
-    public long getPerformanceProgressPrintFrequencySeconds();    
-
-    /**
-     * How often should the performance log message be written?
-     * @param seconds number of seconds between messages indicating performance frequency.
-     */
-    public void setPerformanceProgressPrintFrequencySeconds(long seconds);
+    // intentionally empty: there is currently nothing we track through this MBean
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java
index aefa9c12d..d9a694846 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/ShardTraverser.java
@@ -4,13 +4,13 @@ import org.apache.log4j.Logger;
 import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider;
 import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
 import org.broadinstitute.sting.gatk.datasources.reads.Shard;
-import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
+import org.broadinstitute.sting.gatk.io.ThreadGroupOutputTracker;
 import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
 import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.utils.Utils;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 
 import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
 /**
  * User: hanna
  * Date: Apr 29, 2009
@@ -30,8 +30,7 @@ public class ShardTraverser implements Callable {
     final private HierarchicalMicroScheduler microScheduler;
     final private Walker walker;
     final private Shard shard;
-    final private TraversalEngine traversalEngine;
-    final private ThreadLocalOutputTracker outputTracker;
+    final private ThreadGroupOutputTracker outputTracker;
     private OutputMergeTask outputMergeTask;
 
     /** our log, which we want to capture anything from this class */
@@ -43,22 +42,26 @@ public class ShardTraverser implements Callable {
     private boolean complete = false;
 
     public ShardTraverser( HierarchicalMicroScheduler microScheduler,
-                           TraversalEngine traversalEngine,
                            Walker walker,
                            Shard shard,
-                           ThreadLocalOutputTracker outputTracker) {
+                           ThreadGroupOutputTracker outputTracker) {
         this.microScheduler = microScheduler;
         this.walker = walker;
-        this.traversalEngine = traversalEngine;
         this.shard = shard;
         this.outputTracker = outputTracker;
     }
 
     public Object call() {
+        final Object traversalEngineKey = Thread.currentThread();
+        final TraversalEngine traversalEngine = microScheduler.borrowTraversalEngine(traversalEngineKey);
+
         try {
-            traversalEngine.startTimersIfNecessary();
             final long startTime = System.currentTimeMillis();
 
+            // this is CRITICAL -- initializes output maps in this master thread,
+            // so that any subthreads created by the traversal itself can access this map
+            outputTracker.initializeStorage();
+
             Object accumulator = walker.reduceInit();
             final WindowMaker windowMaker = new WindowMaker(shard,microScheduler.getEngine().getGenomeLocParser(),
                     microScheduler.getReadIterator(shard),
@@ -67,7 +70,7 @@ public class ShardTraverser implements Callable {
 
             for(WindowMaker.WindowMakerIterator iterator: windowMaker) {
                 final ShardDataProvider dataProvider = new LocusShardDataProvider(shard,iterator.getSourceInfo(),microScheduler.getEngine().getGenomeLocParser(),iterator.getLocus(),iterator,microScheduler.reference,microScheduler.rods);
-                accumulator = traversalEngine.traverse( walker, dataProvider, accumulator );
+                accumulator = traversalEngine.traverse(walker, dataProvider, accumulator);
                 dataProvider.close();
             }
 
@@ -85,11 +88,20 @@ public class ShardTraverser implements Callable {
         } finally {
             synchronized(this) {
                 complete = true;
+                microScheduler.returnTraversalEngine(traversalEngineKey, traversalEngine);
                 notifyAll();
             }
         }
     }
 
+    /**
+     * Return a human readable string describing the intervals this traverser is operating on
+     * @return the genome locs of this traverser's shard, joined with commas
+     */
+    public String getIntervalsString() {
+        return Utils.join(",", shard.getGenomeLocs());
+    }
+
     /**
      * Has this traversal completed?
      * @return True if completed, false otherwise.
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
index da11d36dd..6c0dc9769 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/executive/WindowMaker.java
@@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.datasources.reads.Shard;
 import org.broadinstitute.sting.gatk.iterators.LocusIterator;
 import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState;
+import org.broadinstitute.sting.gatk.iterators.LocusIteratorByStateExperimental;
 import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
 import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
@@ -81,7 +82,13 @@ public class WindowMaker implements Iterable, I
     public WindowMaker(Shard shard, GenomeLocParser genomeLocParser, StingSAMIterator iterator, List intervals, Collection sampleNames) {
         this.sourceInfo = shard.getReadProperties();
         this.readIterator = iterator;
-        this.sourceIterator = new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
+        // Temporary: use the experimental version of LocusIteratorByState if experimental downsampling was requested
+        // (the read properties' downsampling method has useExperimentalDownsampling set); otherwise use the standard one:
+        this.sourceIterator = sourceInfo.getDownsamplingMethod().useExperimentalDownsampling ?
+                              new PeekableIterator(new LocusIteratorByStateExperimental(iterator,sourceInfo,genomeLocParser, sampleNames))
+                              :
+                              new PeekableIterator(new LocusIteratorByState(iterator,sourceInfo,genomeLocParser, sampleNames));
+
         this.intervalIterator = intervals.size()>0 ? new PeekableIterator(intervals.iterator()) : null;
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java b/public/java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java
index 8596e18eb..b3c84511a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/BadMateFilter.java
@@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.filters;
 import net.sf.samtools.SAMRecord;
 
 /**
- * Filter out reads with low mapping qualities.
+ * Filter out reads whose mate maps to a different contig.
  *
  * @author ebanks
  * @version 0.1
diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java
index 67f82235d..5ca8a1779 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java
@@ -25,9 +25,14 @@
 
 package org.broadinstitute.sting.gatk.filters;
 
+import com.google.common.base.Function;
+import com.google.common.collect.Collections2;
+import org.broadinstitute.sting.utils.Utils;
 import org.broadinstitute.sting.utils.classloader.PluginManager;
+import org.broadinstitute.sting.utils.help.GATKDocUtils;
 
 import java.util.Collection;
+import java.util.List;
 
 /**
  * Manage filters and filter options.  Any requests for basic filtering classes
@@ -54,4 +59,39 @@ public class FilterManager extends PluginManager {
     public Collection> getValues() {
         return this.getPlugins();
     }
+
+    /**
+     * Rather than use the default error message, print out a list of read filters as well.
+     * @param pluginCategory - string, the category of the plugin (e.g. read filter); not used, the text always says "Read filter"
+     * @param pluginName - string, what we were trying to match (but failed to)
+     * @return - A not-found notice naming pluginName, then a listing of available read filters and a pointer to the GATK docs
+     */
+    @Override
+    protected String formatErrorMessage(String pluginCategory, String pluginName) {
+        // gather every plugin implementing ReadFilter so the user can see the valid choices
+        final List<Class<? extends ReadFilter>> availableFilters = this.getPluginsImplementing(ReadFilter.class);
+
+        return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName,
+                userFriendlyListofReadFilters(availableFilters),
+                "Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information.");
+    }
+
+    /** Builds a two-column table (filter name, GATKDocs help link) with a header row and one row per read filter. */
+    private String userFriendlyListofReadFilters(List<Class<? extends ReadFilter>> filters) {
+        final String headName = "FilterName", headDoc = "Documentation";
+        // seed with the header width so the name column is never narrower than its own title (and never negative)
+        int longestNameLength = headName.length();
+        for ( final Class<? extends ReadFilter> filter : filters ) {
+            longestNameLength = Math.max(longestNameLength,this.getName(filter).length());
+        }
+        final String format = "   %"+longestNameLength+"s        %s%n";
+
+        final StringBuilder listBuilder = new StringBuilder();
+        listBuilder.append(String.format(format,headName,headDoc));
+        for ( final Class<? extends ReadFilter> filter : filters ) {
+            final String helpLink = GATKDocUtils.helpLinksToGATKDocs(filter);
+            listBuilder.append(String.format(format,this.getName(filter),helpLink));
+        }
+        return listBuilder.toString();
+    }
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java b/public/java/src/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java
new file mode 100644
index 000000000..fdfe494a7
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/ThreadGroupOutputTracker.java
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2009 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.io;
+
+import org.broadinstitute.sting.gatk.executive.OutputMergeTask;
+import org.broadinstitute.sting.gatk.io.storage.Storage;
+import org.broadinstitute.sting.gatk.io.storage.StorageFactory;
+import org.broadinstitute.sting.gatk.io.stubs.Stub;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * An output tracker that can either track its output per-thread or directly.
+ *
+ * This output tracker doesn't use thread local values, but rather looks up the
+ * storage map via the thread's group.  This is necessary in the case where
+ * there's a master thread that creates the output map, and spawns subthreads
+ * that actually do work.  As long as those subthreads are spawned in the
+ * thread group of the master thread, this tracker will properly find the
+ * storage map associated with the master thread in the group, and return
+ * the map to all subthreads.
+ *
+ * @author mhanna, depristo
+ * @version 0.2
+ */
+public class ThreadGroupOutputTracker extends OutputTracker {
+    /**
+     * A map from the thread group of the master thread to the storage map from
+     * Stub to Storage objects
+     */
+    private final Map<ThreadGroup, Map<Stub, Storage>> threadsToStorage = new HashMap<ThreadGroup, Map<Stub, Storage>>();
+
+    /**
+     * A total hack.  If bypass = true, getStorage() skips the per-thread-group storage maps and
+     * uses the storage held in the outputs map instead.  Used to handle output during initialize() and onTraversalDone().
+     */
+    private boolean bypass = false;
+    public void bypassThreadLocalStorage(boolean bypass) {
+        this.bypass = bypass;
+    }
+
+    /**
+     * Initialize the storage map for the current thread's group.
+     *
+     * Checks if there's already a storage map bound to the thread group of
+     * the calling thread, and if not creates and registers one.  The map is then
+     * populated with a fresh, temp-file-backed stub -> storage binding for every
+     * stub in the superclass' outputs map, replacing any binding already present.
+     *
+     * Must be called within the master thread, before any thread of its group
+     * asks for storage via getStorage().
+     */
+    public synchronized void initializeStorage() {
+        final ThreadGroup group = Thread.currentThread().getThreadGroup();
+        Map<Stub, Storage> threadLocalOutputStreams = threadsToStorage.get(group);
+
+        if( threadLocalOutputStreams == null ) {
+            threadLocalOutputStreams = new HashMap<Stub, Storage>();
+            threadsToStorage.put( group, threadLocalOutputStreams );
+        }
+
+        for ( final Stub stub : outputs.keySet() ) {
+            final Storage target = StorageFactory.createStorage(stub, createTempFile(stub));
+            threadLocalOutputStreams.put(stub, target);
+        }
+    }
+
+    @Override
+    public <T> T getStorage( final Stub<T> stub ) {
+        Storage target;
+        if (bypass) {
+            // bypass mode: reuse, or lazily create and register, the storage kept directly in the outputs map
+            target = outputs.get(stub);
+            if( target == null ) {
+                target = StorageFactory.createStorage(stub);
+                outputs.put(stub, target);
+            }
+        }
+        else {
+            final Map<Stub, Storage> threadLocalOutputStreams = findStorage(Thread.currentThread());
+            target = threadLocalOutputStreams.get(stub);
+
+            // make sure something hasn't gone wrong, and we somehow find a map that doesn't include our stub
+            if ( target == null )
+                throw new ReviewedStingException("target isn't supposed to be null for " + Thread.currentThread()
+                        + " id " + Thread.currentThread().getId() + " map is " + threadLocalOutputStreams);
+        }
+
+        return (T)target;
+    }
+
+
+    /** Looks up the storage map registered for the thread's group; throws if no map has been registered for that group. */
+    private synchronized Map<Stub, Storage> findStorage(final Thread thread) {
+        final Map<Stub, Storage> map = threadsToStorage.get(thread.getThreadGroup());
+
+        // something is terribly wrong if we have a storage lookup for a thread whose group
+        // doesn't have any map data associated with it!
+        if ( map == null )
+            throw new ReviewedStingException("Couldn't find storage map associated with thread " + thread + " in group " + thread.getThreadGroup());
+
+        return map;
+    }
+
+    /**
+     * Close down any existing temporary files which have been opened for the calling thread's group.
+     * @return a task holding one merge operation per closed storage, or null if the group's map held no storage
+     */
+    public synchronized OutputMergeTask closeStorage() {
+        // findStorage() throws rather than returning null, so only emptiness needs checking
+        final Map<Stub, Storage> threadLocalOutputStreams = findStorage(Thread.currentThread());
+
+        if( threadLocalOutputStreams.isEmpty() )
+            return null;
+
+        final OutputMergeTask outputMergeTask = new OutputMergeTask();
+        for( final Map.Entry<Stub, Storage> entry: threadLocalOutputStreams.entrySet() ) {
+            final Stub stub = entry.getKey();
+            final Storage storageEntry = entry.getValue();
+            storageEntry.close();
+            outputMergeTask.addMergeOperation(getTargetStream(stub), storageEntry);
+        }
+
+        threadLocalOutputStreams.clear();
+
+        return outputMergeTask;
+    }
+
+    /**
+     * Creates a temporary file for a stub of the given type.
+     * @param stub Stub for which to create a temporary file.
+     * @param <T> Type of the stub to accept.
+     * @return A temp file, or throw an exception if the temp file cannot be created.
+     */
+    private <T> File createTempFile( Stub<T> stub ) {
+        try {
+            return File.createTempFile( stub.getClass().getName(), null );
+        } catch( IOException ex ) {
+            throw new UserException.BadTmpDir("Unable to create temporary file for stub: " + stub.getClass().getName() + " (" + ex.getMessage() + ")" );
+        }
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java b/public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java
deleted file mode 100644
index 636787c69..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/io/ThreadLocalOutputTracker.java
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2009 The Broad Institute
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.io;
-
-import org.broadinstitute.sting.gatk.executive.OutputMergeTask;
-import org.broadinstitute.sting.gatk.io.storage.Storage;
-import org.broadinstitute.sting.gatk.io.storage.StorageFactory;
-import org.broadinstitute.sting.gatk.io.stubs.Stub;
-import org.broadinstitute.sting.utils.exceptions.UserException;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.Map;
-
-/**
- * An output tracker that can either track its output per-thread or directly,
- *
- * @author mhanna
- * @version 0.1
- */
-public class ThreadLocalOutputTracker extends OutputTracker {
-    /**
-     * Thread-local storage for output streams.
-     */
-    private ThreadLocal> storage = new ThreadLocal>();
-
-    /**
-     * A total hack.  If bypass = true, bypass thread local storage and write directly
-     * to the target file.  Used to handle output during initialize() and onTraversalDone().
-     */
-    private boolean bypass = false;
-    public void bypassThreadLocalStorage(boolean bypass) {
-        this.bypass = bypass;
-    }
-
-    public  T getStorage( Stub stub ) {
-        Storage target;
-
-        if(bypass) {
-            target = outputs.get(stub);
-            if( target == null ) {
-                target = StorageFactory.createStorage(stub);
-                outputs.put(stub, target);
-            }
-        }
-        else {
-            Map threadLocalOutputStreams = storage.get();
-
-            if( threadLocalOutputStreams == null ) {
-                threadLocalOutputStreams = new HashMap();
-                storage.set( threadLocalOutputStreams );
-            }
-
-            target = threadLocalOutputStreams.get(stub);
-            if( target == null ) {
-                target = StorageFactory.createStorage(stub, createTempFile(stub));
-                threadLocalOutputStreams.put(stub, target);
-            }
-        }
-
-        return (T)target;
-    }
-
-    /**
-     * Close down any existing temporary files which have been opened.
-     */
-    public OutputMergeTask closeStorage() {
-        Map threadLocalOutputStreams = storage.get();
-
-        if( threadLocalOutputStreams == null || threadLocalOutputStreams.isEmpty() )
-            return null;
-
-        OutputMergeTask outputMergeTask = new OutputMergeTask();
-        for( Map.Entry entry: threadLocalOutputStreams.entrySet() ) {
-            Stub stub = entry.getKey();
-            Storage storageEntry = entry.getValue();
-
-            storageEntry.close();
-            outputMergeTask.addMergeOperation(getTargetStream(stub),storageEntry);            
-        }
-        
-        threadLocalOutputStreams.clear();
-
-        return outputMergeTask;
-    }
-
-    /**
-     * Creates a temporary file for a stub of the given type.
-     * @param stub Stub for which to create a temporary file.
-     * @param  Type of the stub to accept.
-     * @return A temp file, or throw an exception if the temp file cannot be created.
-     */
-    private  File createTempFile( Stub stub ) {
-        File tempFile = null;
-
-        try {
-            tempFile = File.createTempFile( stub.getClass().getName(), null );
-            //tempFile.deleteOnExit();
-        }
-        catch( IOException ex ) {
-            throw new UserException.BadTmpDir("Unable to create temporary file for stub: " + stub.getClass().getName() );
-        }
-
-        return tempFile;
-    }
-}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java
index 300e801e6..9f69a4144 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/SAMFileWriterStorage.java
@@ -50,7 +50,7 @@ public class SAMFileWriterStorage implements SAMFileWriter, Storage codec = fd.getCodec();
             final AbstractFeatureReader source =
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java
index da4eb3955..ac01468eb 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/OutputStreamArgumentTypeDescriptor.java
@@ -86,7 +86,7 @@ public class OutputStreamArgumentTypeDescriptor extends ArgumentTypeDescriptor {
     @Override
     public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches )  {
         ArgumentDefinition definition = createDefaultArgumentDefinition(source);
-        String fileName = getArgumentValue( definition, matches );
+        String fileName = getArgumentValue( definition, matches ).asString();
 
         // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object;
         // therefore, the user must have failed to specify a type default
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java
index 83d1b7eb2..f13cb8fa8 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileReaderArgumentTypeDescriptor.java
@@ -25,15 +25,11 @@
 package org.broadinstitute.sting.gatk.io.stubs;
 
 import net.sf.samtools.SAMFileReader;
-import org.broadinstitute.sting.commandline.ArgumentMatches;
-import org.broadinstitute.sting.commandline.ArgumentSource;
-import org.broadinstitute.sting.commandline.ArgumentTypeDescriptor;
-import org.broadinstitute.sting.commandline.ParsingEngine;
+import org.broadinstitute.sting.commandline.*;
 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 import org.broadinstitute.sting.utils.sam.SAMFileReaderBuilder;
 
-import java.io.File;
 import java.lang.reflect.Type;
 
 /**
@@ -47,7 +43,7 @@ public class SAMFileReaderArgumentTypeDescriptor extends ArgumentTypeDescriptor
 
     /**
      * Create a new SAMFileReader argument, notifying the given engine when that argument has been created.
-     * @param engine
+     * @param engine engine
      */
     public SAMFileReaderArgumentTypeDescriptor( GenomeAnalysisEngine engine ) {
         this.engine = engine;
@@ -62,12 +58,12 @@ public class SAMFileReaderArgumentTypeDescriptor extends ArgumentTypeDescriptor
     public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches ) {
         SAMFileReaderBuilder builder = new SAMFileReaderBuilder();
 
-        String readerFileName = getArgumentValue( createDefaultArgumentDefinition(source), matches );
+        ArgumentMatchValue readerFileName = getArgumentValue( createDefaultArgumentDefinition(source), matches );
 
         if( readerFileName == null )
             throw new UserException.CommandLineException("SAM file compression was supplied, but no associated writer was supplied with it.");
 
-        builder.setSAMFile(new File(readerFileName));
+        builder.setSAMFile(readerFileName.asFile());
 
         // WARNING: Skipping required side-effect because stub is impossible to generate.
         engine.addInput(source, builder);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java
index 8566f6c63..00c6ddae8 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterArgumentTypeDescriptor.java
@@ -31,7 +31,6 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 
-import java.io.File;
 import java.io.OutputStream;
 import java.lang.annotation.Annotation;
 import java.lang.reflect.Type;
@@ -111,10 +110,10 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
     public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches )  {
         // Extract all possible parameters that could be passed to a BAM file writer?
         ArgumentDefinition bamArgumentDefinition = createBAMArgumentDefinition(source);
-        String writerFileName = getArgumentValue( bamArgumentDefinition, matches );
+        ArgumentMatchValue writerFileName = getArgumentValue( bamArgumentDefinition, matches );
 
-        String compressionLevelText = getArgumentValue( createBAMCompressionArgumentDefinition(source), matches );
-        Integer compressionLevel = compressionLevelText != null ? Integer.valueOf(compressionLevelText) : null;
+        ArgumentMatchValue compressionLevelText = getArgumentValue( createBAMCompressionArgumentDefinition(source), matches );
+        Integer compressionLevel = compressionLevelText != null ? Integer.valueOf(compressionLevelText.asString()) : null;
 
         boolean indexOnTheFly = !argumentIsPresent(disableWriteIndexArgumentDefinition(source),matches);
         boolean generateMD5 = argumentIsPresent(this.enableMD5GenerationArgumentDefinition(source),matches);
@@ -124,32 +123,28 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
 
         // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object;
         // therefore, the user must have failed to specify a type default
-        if(writerFileName == null) {
-            if(!source.isRequired())
-                throw new MissingArgumentValueException(bamArgumentDefinition);
-            if(generateMD5)
+        if(writerFileName != null && writerFileName.asFile() == null && generateMD5)
                 throw new ArgumentException("MD5 generation specified, but no output file specified.  If md5 generation is desired, please specify a BAM output file and an md5 file will be written alongside.");
-        }
 
         // Create the stub and set parameters.
-        SAMFileWriterStub stub;
-        if ( writerFileName != null )
-            stub = new SAMFileWriterStub(engine, new File(writerFileName));
-        else
-            stub = new SAMFileWriterStub(engine, defaultOutputStream);
+        SAMFileWriterStub stub = null;      // stub = new SAMFileWriterStub(engine, defaultOutputStream);
 
-        if ( compressionLevel != null )
-            stub.setCompressionLevel(compressionLevel);
-        if ( indexOnTheFly )
-            stub.setIndexOnTheFly(indexOnTheFly);
-        if ( generateMD5 )
-            stub.setGenerateMD5(generateMD5);
-        if ( simplifyBAM )
-            stub.setSimplifyBAM(simplifyBAM);
+        if (writerFileName != null &&  writerFileName.asFile() != null ) {
+            stub = new SAMFileWriterStub(engine, writerFileName.asFile());
 
-        // WARNING: Side effects required by engine!
-        parsingEngine.addTags(stub,getArgumentTags(matches));
-        engine.addOutput(stub);
+            if ( compressionLevel != null )
+                stub.setCompressionLevel(compressionLevel);
+            if ( indexOnTheFly )
+                stub.setIndexOnTheFly(indexOnTheFly);
+            if ( generateMD5 )
+                stub.setGenerateMD5(generateMD5);
+            if ( simplifyBAM )
+                stub.setSimplifyBAM(simplifyBAM);
+
+            // WARNING: Side effects required by engine!
+            parsingEngine.addTags(stub,getArgumentTags(matches));
+            engine.addOutput(stub);
+        }
 
         return stub;
     }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java
index d8e59a3dd..633fd0e37 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/SAMFileWriterStub.java
@@ -31,12 +31,16 @@ import net.sf.samtools.SAMRecord;
 import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import org.broadinstitute.sting.gatk.io.OutputTracker;
 import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
+import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
 import org.broadinstitute.sting.utils.baq.BAQ;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
 
 import java.io.File;
 import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.List;
 
 /**
  * A stub for routing and management of SAM file reading and writing.
@@ -116,15 +120,15 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite
      */
     private boolean simplifyBAM = false;
 
+    private List<ReadTransformer> onOutputReadTransformers = null;   // built lazily on the first addAlignment() call
+
     /**
      * Create a new stub given the requested SAM file and compression level.
      * @param engine source of header data, maybe other data about input files.
      * @param samFile SAM file to (ultimately) create.
      */
     public SAMFileWriterStub( GenomeAnalysisEngine engine, File samFile ) {
-        this.engine = engine;
-        this.samFile = samFile;
-        this.samOutputStream = null;
+        this(engine, samFile, null);
     }
 
     /**
@@ -133,8 +137,12 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite
      * @param stream Output stream to which data should be written.
      */
     public SAMFileWriterStub( GenomeAnalysisEngine engine, OutputStream stream ) {
+        this(engine, null, stream);
+    }
+
+    private SAMFileWriterStub(final GenomeAnalysisEngine engine, final File samFile, final OutputStream stream) {
         this.engine = engine;
-        this.samFile = null;
+        this.samFile = samFile;
         this.samOutputStream = stream;
     }
 
@@ -142,7 +150,7 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite
      * Retrieves the SAM file to (ultimately) be created.
      * @return The SAM file.  Must not be null.
      */
-    public File getSAMFile() {
+    public File getOutputFile() {
         return samFile;
     }
 
@@ -154,7 +162,7 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite
         simplifyBAM = v;
     }
 
-    public OutputStream getSAMOutputStream() {
+    public OutputStream getOutputStream() {
         return samOutputStream;
     }
 
@@ -212,7 +220,7 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite
 
     /**
      * Gets whether to generate an md5 on-the-fly for this BAM.
-     * @return True generates the md5.  False means skip writing the file.
+     * @param generateMD5   True generates the md5.  False means skip writing the file.
      */
     public void setGenerateMD5(boolean generateMD5) {
         if(writeStarted)
@@ -274,17 +282,29 @@ public class SAMFileWriterStub implements Stub, StingSAMFileWrite
         this.headerOverride = header;
     }
 
+    private void initializeReadTransformers() {   // caches the engine's read transformers that apply ON_OUTPUT
+        this.onOutputReadTransformers = new ArrayList<ReadTransformer>(engine.getReadTransformers().size());
+        for ( final ReadTransformer transformer : engine.getReadTransformers() ) {
+            if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_OUTPUT )
+                onOutputReadTransformers.add(transformer);
+        }
+    }
+
     /**
      * @{inheritDoc}
      */
-    public void addAlignment( SAMRecord alignment ) {
-        if ( engine.getArguments().BAQMode != BAQ.CalculationMode.OFF && engine.getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_OUTPUT ) {
-            //System.out.printf("Writing BAQ at OUTPUT TIME%n");
-            baqHMM.baqRead(alignment, engine.getReferenceDataSource().getReference(), engine.getArguments().BAQMode, engine.getWalkerBAQQualityMode());
-        }
+    public void addAlignment( final SAMRecord readIn ) {
+        if ( onOutputReadTransformers == null )
+            initializeReadTransformers();
+
+        GATKSAMRecord workingRead = (GATKSAMRecord)readIn;
+
+        // run on output read transformers
+        for ( final ReadTransformer transform : onOutputReadTransformers )
+            workingRead = transform.apply(workingRead);
 
         writeStarted = true;
-        outputTracker.getStorage(this).addAlignment(alignment);
+        outputTracker.getStorage(this).addAlignment(workingRead);
     }
 
     /**
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java
index b042144b6..873f5b7c8 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/Stub.java
@@ -27,6 +27,9 @@ package org.broadinstitute.sting.gatk.io.stubs;
 
 import org.broadinstitute.sting.gatk.io.OutputTracker;
 
+import java.io.File;
+import java.io.OutputStream;
+
 /**
  * A stub used for managing IO. Acts as a proxy for IO streams
  * not yet created or streams that need significant external
@@ -43,4 +46,14 @@ public interface Stub {
      * @param outputTracker The connector used to provide an appropriate stream.
      */
     public void register( OutputTracker outputTracker );
+
+    /**
+     * Returns the OutputStream represented by this stub or null if not available.
+     */
+    public OutputStream getOutputStream();
+
+    /**
+     * Returns the File represented by this stub or null if not available.
+     */
+    public File getOutputFile();
 }
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java
index 5e1132d45..f521c959d 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VCFWriterArgumentTypeDescriptor.java
@@ -138,8 +138,8 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
     public Object parse( ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches )  {
         ArgumentDefinition defaultArgumentDefinition = createDefaultArgumentDefinition(source);
         // Get the filename for the genotype file, if it exists.  If not, we'll need to send output to out.
-        String writerFileName = getArgumentValue(defaultArgumentDefinition,matches);
-        File writerFile = writerFileName != null ? new File(writerFileName) : null;
+        ArgumentMatchValue writerFileName = getArgumentValue(defaultArgumentDefinition,matches);
+        File writerFile = writerFileName != null ? writerFileName.asFile() : null;
 
         // This parser has been passed a null filename and the GATK is not responsible for creating a type default for the object;
         // therefore, the user must have failed to specify a type default
@@ -151,7 +151,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
                 ? new VariantContextWriterStub(engine, writerFile, argumentSources)
                 : new VariantContextWriterStub(engine, defaultOutputStream, argumentSources);
 
-        stub.setCompressed(isCompressed(writerFileName));
+        stub.setCompressed(isCompressed(writerFileName == null ? null: writerFileName.asString()));
         stub.setDoNotWriteGenotypes(argumentIsPresent(createSitesOnlyArgumentDefinition(),matches));
         stub.setSkipWritingCommandLineHeader(argumentIsPresent(createNoCommandLineHeaderArgumentDefinition(),matches));
         stub.setForceBCF(argumentIsPresent(createBCFArgumentDefinition(),matches));
diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
index 260a7efda..f92d78bb5 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/io/stubs/VariantContextWriterStub.java
@@ -32,9 +32,9 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
 import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 import org.broadinstitute.sting.utils.variantcontext.writer.Options;
 import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
-import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
 
 import java.io.File;
@@ -140,7 +140,7 @@ public class VariantContextWriterStub implements Stub, Var
      * Retrieves the file to (ultimately) be created.
      * @return The file.  Can be null if genotypeStream is not.
      */
-    public File getFile() {
+    public File getOutputFile() {
         return genotypeFile;
     }
 
@@ -148,7 +148,7 @@ public class VariantContextWriterStub implements Stub, Var
      * Retrieves the output stearm to which to (ultimately) write.
      * @return The file.  Can be null if genotypeFile is not.
      */
-    public PrintStream getOutputStream() {
+    public OutputStream getOutputStream() {
         return genotypeStream;
     }
 
@@ -196,7 +196,7 @@ public class VariantContextWriterStub implements Stub, Var
         if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER);
         if ( indexOnTheFly && ! isCompressed() ) options.add(Options.INDEX_ON_THE_FLY);
 
-        if ( forceBCF || (getFile() != null && VariantContextWriterFactory.isBCFOutput(getFile())) )
+        if ( forceBCF || (getOutputFile() != null && VariantContextWriterFactory.isBCFOutput(getOutputFile())) )
             options.add(Options.FORCE_BCF);
 
         return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options);
@@ -269,9 +269,9 @@ public class VariantContextWriterStub implements Stub, Var
      * @return
      */
     public boolean alsoWriteBCFForTest() {
-        return engine.getArguments().numberOfThreads == 1 && // only works single threaded
+        return engine.getArguments().numberOfDataThreads == 1 && // only works single threaded
                 ! isCompressed() && // for non-compressed outputs
-                getFile() != null && // that are going to disk
+                getOutputFile() != null && // that are going to disk
                 engine.getArguments().generateShadowBCF; // and we actually want to do it
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java
similarity index 88%
rename from public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java
rename to public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java
index 835748ff0..c0de06b49 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/iterators/DownsampleIterator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LegacyDownsampleIterator.java
@@ -6,13 +6,13 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
 import java.util.Iterator;
 
 
-public class DownsampleIterator implements StingSAMIterator {
+public class LegacyDownsampleIterator implements StingSAMIterator {
 
     StingSAMIterator it;
     int cutoff;
     SAMRecord next;
 
-    public DownsampleIterator(StingSAMIterator it, double fraction) {
+    public LegacyDownsampleIterator(StingSAMIterator it, double fraction) {
         this.it = it;
         cutoff = (int)(fraction * 10000);
         next = getNextRecord();
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
index 64f914064..46e84798a 100755
--- a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByState.java
@@ -31,8 +31,8 @@ import net.sf.samtools.CigarElement;
 import net.sf.samtools.CigarOperator;
 import net.sf.samtools.SAMRecord;
 import org.apache.log4j.Logger;
-import org.broadinstitute.sting.gatk.DownsampleType;
-import org.broadinstitute.sting.gatk.DownsamplingMethod;
+import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
+import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
 import org.broadinstitute.sting.gatk.ReadProperties;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.utils.GenomeLoc;
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java
new file mode 100755
index 000000000..557cbd009
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimental.java
@@ -0,0 +1,649 @@
+/*
+ * Copyright (c) 2009 The Broad Institute
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+package org.broadinstitute.sting.gatk.iterators;
+
+import net.sf.picard.util.PeekableIterator;
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import net.sf.samtools.SAMRecord;
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.ReadProperties;
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
+import org.broadinstitute.sting.gatk.downsampling.Downsampler;
+import org.broadinstitute.sting.gatk.downsampling.LevelingDownsampler;
+import org.broadinstitute.sting.utils.GenomeLoc;
+import org.broadinstitute.sting.utils.GenomeLocParser;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+import org.broadinstitute.sting.utils.sam.ReadUtils;
+
+import java.util.*;
+
+/**
+ * Iterator that traverses a SAM File, accumulating information on a per-locus basis
+ */
+public class LocusIteratorByStateExperimental extends LocusIterator {
+    /**
+     * our log, which we want to capture anything from this class
+     */
+    private static final Logger logger = Logger.getLogger(LocusIteratorByStateExperimental.class);
+
+    // -----------------------------------------------------------------------------------------------------------------
+    //
+    // member fields
+    //
+    // -----------------------------------------------------------------------------------------------------------------
+
+    /**
+     * Used to create new GenomeLocs.
+     */
+    private final GenomeLocParser genomeLocParser;
+    private final ArrayList<String> samples;   // samples (as Strings), visited in list order when building per-sample pileups
+    private final ReadStateManager readStates;
+
+    protected static class SAMRecordState {
+        SAMRecord read;
+        int readOffset = -1;                    // how far are we offset from the start of the read bases?
+        int genomeOffset = -1;                  // how far are we offset from the alignment start on the genome?
+
+        Cigar cigar = null;
+        int cigarOffset = -1;
+        CigarElement curElement = null;
+        int nCigarElements = 0;
+
+        int cigarElementCounter = -1;           // how far are we into a single cigarElement
+
+        // The logical model for generating extended events is as follows: the "record state" implements the traversal
+        // along the reference; thus stepForwardOnGenome() returns on every and only on actual reference bases. This
+        // can be a (mis)match or a deletion (in the latter case, we still return on every individual reference base the
+        // deletion spans). In the extended events mode, the record state also remembers if there was an insertion, or
+        // if the deletion just started *right before* the current reference base the record state is pointing to upon the return from
+        // stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
+        // events immediately preceding the current reference base).
+
+        public SAMRecordState(SAMRecord read) {
+            this.read = read;
+            cigar = read.getCigar();
+            nCigarElements = cigar.numCigarElements();
+
+            //System.out.printf("Creating a SAMRecordState: %s%n", this);
+        }
+
+        public SAMRecord getRead() {
+            return read;
+        }
+
+        /**
+         * What is our current offset in the read's bases that aligns us with the reference genome?
+         *
+         * @return the current offset into the read's bases; -1 if no base of the read has been processed yet
+         */
+        public int getReadOffset() {
+            return readOffset;
+        }
+
+        /**
+         * What is the current offset w.r.t. the alignment state that aligns us to the readOffset?
+         *
+         * @return the current offset from the read's alignment start on the genome; -1 before the first step forward
+         */
+        public int getGenomeOffset() {
+            return genomeOffset;
+        }
+
+        public int getGenomePosition() {
+            return read.getAlignmentStart() + getGenomeOffset();
+        }
+
+        public GenomeLoc getLocation(GenomeLocParser genomeLocParser) {
+            return genomeLocParser.createGenomeLoc(read.getReferenceName(), getGenomePosition());
+        }
+
+        public CigarOperator getCurrentCigarOperator() {
+            return curElement.getOperator();
+        }
+
+        public String toString() {
+            return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
+        }
+
+        public CigarElement peekForwardOnGenome() {   // the following cigar element when one more step would leave the current element (and a following one exists), else the current element
+            return ( cigarElementCounter + 1 > curElement.getLength() && cigarOffset + 1 < nCigarElements ? cigar.getCigarElement(cigarOffset + 1) : curElement );
+        }
+
+        public CigarElement peekBackwardOnGenome() {   // NOTE(review): 'cigarOffset - 1 > 0' means cigar element 0 is never returned as the previous element -- confirm '>= 0' is not intended
+            return ( cigarElementCounter - 1 == 0 && cigarOffset - 1 > 0 ? cigar.getCigarElement(cigarOffset - 1) : curElement );
+        }
+
+        
+        public CigarOperator stepForwardOnGenome() {
+            // we enter this method with readOffset = index of the last processed base on the read
+            // (-1 if we did not process a single base yet); this can be last matching base, or last base of an insertion
+            // returns the operator of the reference-consuming element just stepped onto (M/EQ/X/D/N), or null once
+            // the read's cigar is exhausted; H/P/I/S elements are consumed whole and stepping continues past them
+            if (curElement == null || ++cigarElementCounter > curElement.getLength()) {
+                cigarOffset++;
+                if (cigarOffset < nCigarElements) {
+                    curElement = cigar.getCigarElement(cigarOffset);
+                    cigarElementCounter = 0;
+                    // next line: guards against cigar elements of length 0; when new cigar element is retrieved,
+                    // we reenter in order to re-check cigarElementCounter against curElement's length
+                    return stepForwardOnGenome();
+                } else {
+                    if (curElement != null && curElement.getOperator() == CigarOperator.D)
+                        throw new UserException.MalformedBAM(read, "read ends with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
+                        
+                    // Reads that contain indels model the genomeOffset as the following base in the reference.  Because
+                    // we fall into this else block only when indels end the read, increment genomeOffset  such that the
+                    // current offset of this read is the next ref base after the end of the indel.  This position will
+                    // model a point on the reference somewhere after the end of the read.
+                    genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
+                    // we do step forward on the ref, and by returning null we also indicate that we are past the read end.
+
+                    return null;
+                }
+            }
+
+            boolean done = false;
+            switch (curElement.getOperator()) {
+                case H: // ignore hard clips
+                case P: // ignore pads
+                    cigarElementCounter = curElement.getLength();
+                    break;
+                case I: // insertion w.r.t. the reference
+                case S: // soft clip
+                    cigarElementCounter = curElement.getLength();
+                    readOffset += curElement.getLength();
+                    break;
+                case D: // deletion w.r.t. the reference
+                    if (readOffset < 0)             // we don't want reads starting with deletion, this is a malformed cigar string
+                        throw new UserException.MalformedBAM(read, "read starts with deletion. Cigar: " + read.getCigarString() + ". Although the SAM spec technically permits such reads, this is often indicative of malformed files. If you are sure you want to use this file, re-run your analysis with the extra option: -rf BadCigar");
+                    // should be the same as N case
+                    genomeOffset++;
+                    done = true;
+                    break;
+                case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning)
+                    genomeOffset++;
+                    done = true;
+                    break;
+                case M:
+                case EQ:
+                case X:
+                    readOffset++;
+                    genomeOffset++;
+                    done = true;
+                    break;
+                default:
+                    throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
+            }
+
+            return done ? curElement.getOperator() : stepForwardOnGenome();
+        }
+    }
+
+    //final boolean DEBUG = false;
+    //final boolean DEBUG2 = false && DEBUG;
+    private ReadProperties readInfo;
+    private AlignmentContext nextAlignmentContext;
+    private boolean performLevelingDownsampling;
+
+    // -----------------------------------------------------------------------------------------------------------------
+    //
+    // constructors and other basic operations
+    //
+    // -----------------------------------------------------------------------------------------------------------------
+
+    public LocusIteratorByStateExperimental(final Iterator samIterator, ReadProperties readInformation, GenomeLocParser genomeLocParser, Collection samples) {
+        this.readInfo = readInformation;
+        this.genomeLocParser = genomeLocParser;
+        this.samples = new ArrayList(samples);
+        this.readStates = new ReadStateManager(samIterator);
+
+        this.performLevelingDownsampling = readInfo.getDownsamplingMethod() != null &&
+                                           readInfo.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE &&
+                                           readInfo.getDownsamplingMethod().toCoverage != null;
+
+        // currently the GATK expects this LocusIteratorByState to accept empty sample lists, when
+        // there's no read data.  So we need to throw this error only when samIterator.hasNext() is true
+        if (this.samples.isEmpty() && samIterator.hasNext()) {
+            throw new IllegalArgumentException("samples list must not be empty");
+        }
+    }
+
+    /**
+     * For testing only.  Assumes that the incoming SAMRecords have no read groups, so creates a dummy sample list
+     * for the system.
+     */
+    public final static Collection sampleListForSAMWithoutReadGroups() {
+        // a single null entry stands in for the sample of reads that carry no read group
+        final List dummySamples = new ArrayList(Collections.singletonList(null));
+        return dummySamples;
+    }
+
+    public Iterator iterator() {
+        return this;
+    }
+
+    public void close() {
+        // NOTE(review): intentionally a no-op -- nothing is closed here; confirm the caller closes the underlying read iterator
+    }
+
+    public boolean hasNext() {
+        // materialize the next alignment context (if one can be built) before testing for it
+        lazyLoadNextAlignmentContext();
+        return nextAlignmentContext != null;
+    }
+
+    private GenomeLoc getLocation() {   // location of the first read state, or null when there are no read states
+        return readStates.isEmpty() ? null : readStates.getFirst().getLocation(genomeLocParser);
+    }
+
+    // -----------------------------------------------------------------------------------------------------------------
+    //
+    // next() routine and associated collection operations
+    //
+    // -----------------------------------------------------------------------------------------------------------------
+    public AlignmentContext next() {
+        // hasNext() performs the lazy load itself, so no separate load call is needed before the check
+        if (!hasNext())
+            throw new NoSuchElementException("LocusIteratorByState: out of elements.");
+        final AlignmentContext currentAlignmentContext = nextAlignmentContext;
+        nextAlignmentContext = null;   // cleared so the next hasNext()/next() call advances to a new context
+        return currentAlignmentContext;
+    }
+
+    /**
+     * Creates the next alignment context from the given state.  Note that this is implemented as a lazy load method.
+     * nextAlignmentContext MUST BE null in order for this method to advance to the next entry.
+     */
+    private void lazyLoadNextAlignmentContext() {
+        while (nextAlignmentContext == null && readStates.hasNext()) {
+            readStates.collectPendingReads();
+
+            final GenomeLoc location = getLocation();
+            final Map fullPileup = new HashMap();
+
+            // TODO: How can you determine here whether the current pileup has been downsampled?
+            boolean hasBeenSampled = false;
+
+            for (final String sample : samples) {
+                final Iterator iterator = readStates.iterator(sample);
+                final List pile = new ArrayList(readStates.size(sample));
+
+                int size = 0;                                                           // number of elements in this sample's pileup
+                int nDeletions = 0;                                                     // number of deletions in this sample's pileup
+                int nMQ0Reads = 0;                                                      // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
+
+                while (iterator.hasNext()) {
+                    final SAMRecordState state = iterator.next();                   // state object with the read/offset information
+                    final GATKSAMRecord read = (GATKSAMRecord) state.getRead();     // the actual read
+                    final CigarOperator op = state.getCurrentCigarOperator();       // current cigar operator
+                    final CigarElement nextElement = state.peekForwardOnGenome();   // next cigar element
+                    final CigarElement lastElement = state.peekBackwardOnGenome();  // last cigar element
+                    final boolean isSingleElementCigar = nextElement == lastElement;
+                    final CigarOperator nextOp = nextElement.getOperator();         // next cigar operator
+                    final CigarOperator lastOp = lastElement.getOperator();         // last cigar operator
+                    int readOffset = state.getReadOffset();                         // the base offset on this read
+
+                    final boolean isBeforeDeletion  = nextOp == CigarOperator.DELETION;
+                    final boolean isAfterDeletion   = lastOp == CigarOperator.DELETION;
+                    final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
+                    final boolean isAfterInsertion  = lastOp == CigarOperator.INSERTION && !isSingleElementCigar;
+                    final boolean isNextToSoftClip  = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
+
+                    int nextElementLength = nextElement.getLength();
+
+                    if (op == CigarOperator.N)                                      // N's are never added to any pileup
+                        continue;
+
+                    if (op == CigarOperator.D) {
+                        // TODO -- LIBS is totally busted for deletions so that reads with Ds right before Is in their CIGAR are broken; must fix
+                        if (readInfo.includeReadsWithDeletionAtLoci()) {            // only add deletions to the pileup if we are authorized to do so
+                            pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
+                            size++;
+                            nDeletions++;
+                            if (read.getMappingQuality() == 0)
+                                nMQ0Reads++;
+                        }
+                    }
+                    else {
+                        if (!filterBaseInRead(read, location.getStart())) {
+                            String insertedBaseString = null;
+                            if (nextOp == CigarOperator.I) {
+                                final int insertionOffset = isSingleElementCigar ? 0 : 1;
+                                // TODO -- someone please implement a better fix for the single element insertion CIGAR!
+                                if (isSingleElementCigar)
+                                    readOffset -= (nextElement.getLength() - 1); // LIBS has passed over the insertion bases!
+                                insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + insertionOffset, readOffset + insertionOffset + nextElement.getLength()));
+                            }
+
+                            pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
+                            size++;
+                            if (read.getMappingQuality() == 0)
+                                nMQ0Reads++;
+                        }
+                    }
+                }
+
+                if (pile.size() != 0)                                             // if this pileup added at least one base, add it to the full pileup
+                    fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
+            }
+
+            updateReadStates();                                                   // critical - must be called after we get the current state offsets and location
+            if (!fullPileup.isEmpty())                                            // if we got reads with non-D/N over the current position, we are done
+                nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
+        }
+    }
+
+    // fast testing of position
+    private boolean readIsPastCurrentPosition(SAMRecord read) {
+        if (readStates.isEmpty())
+            return false;
+        else {
+            SAMRecordState state = readStates.getFirst();
+            SAMRecord ourRead = state.getRead();
+            return read.getReferenceIndex() > ourRead.getReferenceIndex() || read.getAlignmentStart() > state.getGenomePosition();
+        }
+    }
+
+    /**
+     * Generic place to put per-base filters appropriate to LocusIteratorByState
+     *
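+     * @param rec the read whose base is being considered
+     * @param pos the reference position of that base (the pileup code passes the current locus start)
+     * @return true if the base should be left out of the pileup, i.e. ReadUtils.isBaseInsideAdaptor(rec, pos) is true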
+     */
+    private static boolean filterBaseInRead(GATKSAMRecord rec, long pos) {
+        return ReadUtils.isBaseInsideAdaptor(rec, pos);
+    }
+
+    private void updateReadStates() {
+        for (final String sample : samples) {
+            Iterator it = readStates.iterator(sample);
+            while (it.hasNext()) {
+                SAMRecordState state = it.next();
+                CigarOperator op = state.stepForwardOnGenome();
+                if (op == null) {
+                    // we discard the read only when we are past its end AND indel at the end of the read (if any) was
+                    // already processed. Keeping the read state that returned null upon stepForwardOnGenome() is safe
+                    // as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
+                    it.remove();                                                // we've stepped off the end of the object
+                }
+            }
+        }
+    }
+
+    public void remove() {
+        throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
+    }
+
+    protected class ReadStateManager {
+        private final PeekableIterator iterator;
+        private final SamplePartitioner samplePartitioner;
+        private final Map readStatesBySample = new HashMap();
+        private int totalReadStates = 0;
+
+        public ReadStateManager(Iterator source) {
+            this.iterator = new PeekableIterator(source);
+
+            for (final String sample : samples) {
+                readStatesBySample.put(sample, new PerSampleReadStateManager());
+            }
+
+            samplePartitioner = new SamplePartitioner();
+        }
+
+        /**
+         * Returns an iterator over all the reads associated with the given sample.  Note that remove() is implemented
+         * for this iterator; if present, total read states will be decremented.
+         *
+         * @param sample The sample.
+         * @return Iterator over the reads associated with that sample.
+         */
+        public Iterator iterator(final String sample) {
+            return new Iterator() {
+                private Iterator wrappedIterator = readStatesBySample.get(sample).iterator();
+
+                public boolean hasNext() {
+                    return wrappedIterator.hasNext();
+                }
+
+                public SAMRecordState next() {
+                    return wrappedIterator.next();
+                }
+
+                public void remove() {
+                    wrappedIterator.remove();
+                }
+            };
+        }
+
+        public boolean isEmpty() {
+            return totalReadStates == 0;
+        }
+
+        /**
+         * Retrieves the total number of reads in the manager across all samples.
+         *
+         * @return Total number of reads over all samples.
+         */
+        public int size() {
+            return totalReadStates;
+        }
+
+        /**
+         * Retrieves the total number of reads in the manager in the given sample.
+         *
+         * @param sample The sample.
+         * @return Total number of reads in the given sample.
+         */
+        public int size(final String sample) {
+            return readStatesBySample.get(sample).size();
+        }
+
+        public SAMRecordState getFirst() {
+            for (final String sample : samples) {
+                PerSampleReadStateManager reads = readStatesBySample.get(sample);
+                if (!reads.isEmpty())
+                    return reads.peek();
+            }
+            return null;
+        }
+
+        public boolean hasNext() {
+            return totalReadStates > 0 || iterator.hasNext();
+        }
+
+        public void collectPendingReads() {
+            if (!iterator.hasNext())
+                return;
+
+            if (readStates.size() == 0) {
+                int firstContigIndex = iterator.peek().getReferenceIndex();
+                int firstAlignmentStart = iterator.peek().getAlignmentStart();
+                while (iterator.hasNext() && iterator.peek().getReferenceIndex() == firstContigIndex && iterator.peek().getAlignmentStart() == firstAlignmentStart) {
+                    samplePartitioner.submitRead(iterator.next());
+                }
+            } else {
+                // Fast fail in the case that the read is past the current position.
+                if (readIsPastCurrentPosition(iterator.peek()))
+                    return;
+
+                while (iterator.hasNext() && !readIsPastCurrentPosition(iterator.peek())) {
+                    samplePartitioner.submitRead(iterator.next());
+                }
+            }
+
+            for (final String sample : samples) {
+                Collection newReads = samplePartitioner.getReadsForSample(sample);
+                PerSampleReadStateManager statesBySample = readStatesBySample.get(sample);
+                addReadsToSample(statesBySample, newReads);
+            }
+
+            samplePartitioner.reset();
+        }
+
+        /**
+         * Builds a read state for each read, advances it with one stepForwardOnGenome() call, and adds the new states to the given per-sample manager.
+         *
+         * @param readStates The list of read states to add this collection of reads.
+         * @param reads      Reads to add.  Selected reads will be pulled from this source.
+         */
+        private void addReadsToSample(final PerSampleReadStateManager readStates, final Collection reads) {
+            if (reads.isEmpty())
+                return;
+
+            Collection newReadStates = new LinkedList();
+
+            for (SAMRecord read : reads) {
+                SAMRecordState state = new SAMRecordState(read);
+                state.stepForwardOnGenome();
+                newReadStates.add(state);
+            }
+
+            readStates.addStatesAtNextAlignmentStart(newReadStates);
+        }
+
+        protected class PerSampleReadStateManager implements Iterable {
+            private List> readStatesByAlignmentStart = new LinkedList>();
+            private int thisSampleReadStates = 0;
+            private Downsampler> levelingDownsampler =
+                      performLevelingDownsampling ?
+                      new LevelingDownsampler, SAMRecordState>(readInfo.getDownsamplingMethod().toCoverage) :
+                      null;
+
+            public void addStatesAtNextAlignmentStart(Collection states) {
+                if ( states.isEmpty() ) {
+                    return;
+                }
+
+                readStatesByAlignmentStart.add(new LinkedList(states));
+                thisSampleReadStates += states.size();
+                totalReadStates += states.size();
+
+                if ( levelingDownsampler != null ) {
+                    levelingDownsampler.submit(readStatesByAlignmentStart);
+                    levelingDownsampler.signalEndOfInput();
+
+                    thisSampleReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
+                    totalReadStates -= levelingDownsampler.getNumberOfDiscardedItems();
+
+                    // use returned List directly rather than make a copy, for efficiency's sake
+                    readStatesByAlignmentStart = levelingDownsampler.consumeFinalizedItems();
+                    levelingDownsampler.reset();
+                }
+            }
+
+            public boolean isEmpty() {
+                return readStatesByAlignmentStart.isEmpty();
+            }
+
+            public SAMRecordState peek() {
+                return isEmpty() ? null : readStatesByAlignmentStart.get(0).peek();
+            }
+
+            public int size() {
+                return thisSampleReadStates;
+            }
+
+            public Iterator iterator() {
+                return new Iterator() {
+                    private Iterator> alignmentStartIterator = readStatesByAlignmentStart.iterator();
+                    private LinkedList currentPositionReadStates = null;
+                    private Iterator currentPositionReadStatesIterator = null;
+
+                    public boolean hasNext() {
+                        return  alignmentStartIterator.hasNext() ||
+                                (currentPositionReadStatesIterator != null && currentPositionReadStatesIterator.hasNext());
+                    }
+
+                    public SAMRecordState next() {
+                        if ( currentPositionReadStatesIterator == null || ! currentPositionReadStatesIterator.hasNext() ) {
+                            currentPositionReadStates = alignmentStartIterator.next();
+                            currentPositionReadStatesIterator = currentPositionReadStates.iterator();
+                        }
+
+                        return currentPositionReadStatesIterator.next();
+                    }
+
+                    public void remove() {
+                        currentPositionReadStatesIterator.remove();
+                        thisSampleReadStates--;
+                        totalReadStates--;
+
+                        if ( currentPositionReadStates.isEmpty() ) {
+                            alignmentStartIterator.remove();
+                        }
+                    }
+                };
+            }
+        }
+    }
+
+    /**
+     * Note: stores reads by sample ID string, not by sample object
+     */
+    private class SamplePartitioner {
+        private Map> readsBySample;
+        private long readsSeen = 0;
+
+        public SamplePartitioner() {
+            readsBySample = new HashMap>();
+
+            for ( String sample : samples ) {
+                readsBySample.put(sample, new ArrayList());
+            }
+        }
+
+        public void submitRead(SAMRecord read) {
+            String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
+            if (readsBySample.containsKey(sampleName))
+                readsBySample.get(sampleName).add(read);
+            readsSeen++;
+        }
+
+        public long getNumReadsSeen() {
+            return readsSeen;
+        }
+
+        public Collection getReadsForSample(String sampleName) {
+            if ( ! readsBySample.containsKey(sampleName) )
+                throw new NoSuchElementException("Sample name not found");
+            return readsBySample.get(sampleName);
+        }
+
+        public void reset() {
+            for ( Collection perSampleReads : readsBySample.values() )
+                perSampleReads.clear();
+            readsSeen = 0;
+        }
+    }
+}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java
new file mode 100644
index 000000000..28348ecc2
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformer.java
@@ -0,0 +1,144 @@
+package org.broadinstitute.sting.gatk.iterators;
+
+import com.google.java.contract.Ensures;
+import com.google.java.contract.Requires;
+import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
+import org.broadinstitute.sting.gatk.walkers.Walker;
+import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
+
+/**
+ * Baseclass used to describe a read transformer like BAQ and BQSR
+ *
+ * Read transformers are pluggable infrastructure that modify read state
+ * either on input, on output, or within walkers themselves.
+ *
+ * The function apply() is called on each read seen by the GATK (after passing
+ * all ReadFilters) and it can do as it sees fit (without modifying the alignment)
+ * to the read to change qualities, add tags, etc.
+ *
+ * Initialize is called once right before the GATK traversal begins providing
+ * the ReadTransformer with the ability to collect and initialize data from the
+ * engine.
+ *
+ * Note that all ReadTransformers within the classpath are created and initialized.  If one
+ * shouldn't be run it should look at the command line options of the engine and override
+ * the enabled.
+ *
+ * @since 8/31/12
+ * @author depristo
+ */
+abstract public class ReadTransformer {
+    /**
+     * When should this read transform be applied?
+     */
+    private ApplicationTime applicationTime;
+
+    /**
+     * Keep track of whether we've been initialized already, and ensure it's not called more than once.
+     */
+    private boolean initialized = false;
+
+    protected ReadTransformer() {}
+
+    /**
+     * Master initialization routine.  Called to set up a ReadTransformer, using its overridden initializeSub routine.
+     *
+     * @param overrideTime if not null, we will run this ReadTransform at the time provided, regardless of the timing of this read transformer itself
+     * @param engine the engine, for initializing values
+     * @param walker the walker we intend to run
+     */
+    @Requires({"initialized == false", "engine != null", "walker != null"})
+    @Ensures("initialized == true")
+    public final void initialize(final ApplicationTime overrideTime, final GenomeAnalysisEngine engine, final Walker walker) {
+        if ( engine == null ) throw new IllegalArgumentException("engine cannot be null");
+        if ( walker == null ) throw new IllegalArgumentException("walker cannot be null");
+
+        this.applicationTime = initializeSub(engine, walker);
+        if ( overrideTime != null ) this.applicationTime = overrideTime;
+        initialized = true;
+    }
+
+    /**
+     * Subclasses must override this to initialize themselves
+     *
+     * @param engine the engine, for initializing values
+     * @param walker the walker we intend to run
+     * @return the point of time we'd like this read transform to be run
+     */
+    @Requires({"engine != null", "walker != null"})
+    @Ensures("result != null")
+    protected abstract ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker);
+
+    /**
+     * Should this ReadTransformer be activated?  Called after initialize, which allows this
+     * read transformer to look at its arguments and decide if it should be active.  All
+     * ReadTransformers must override this, as by default they are not enabled.
+     *
+     * @return true if this ReadTransformer should be used on the read stream
+     */
+    public boolean enabled() {
+        return false;
+    }
+
+    /**
+     * Has this transformer been initialized?
+     *
+     * @return true if it has
+     */
+    public final boolean isInitialized() {
+        return initialized;
+    }
+
+    /**
+     * When should we apply this read transformer?
+     *
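+     * @return the application time selected in initialize(): the override time if one was given, otherwise the value returned by initializeSub()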
+     */
+    public final ApplicationTime getApplicationTime() {
+        return applicationTime;
+    }
+
+    /**
+     * Primary interface function for a read transform to actually do some work
+     *
+     * The function apply() is called on each read seen by the GATK (after passing
+     * all ReadFilters) and it can do as it sees fit (without modifying the alignment)
+     * to the read to change qualities, add tags, etc.
+     *
+     * @param read the read to transform
+     * @return the transformed read
+     */
+    @Requires("read != null")
+    @Ensures("result != null")
+    abstract public GATKSAMRecord apply(final GATKSAMRecord read);
+
+    @Override
+    public String toString() {
+        return getClass().getSimpleName();
+    }
+
+    /**
+     * When should a read transformer be applied?
+     */
+    public static enum ApplicationTime {
+        /**
+         * Walker does not tolerate this read transformer
+         */
+        FORBIDDEN,
+
+        /**
+         * apply the transformation to the incoming reads, the default
+         */
+        ON_INPUT,
+
+        /**
+         * apply the transformation to the outgoing read stream
+         */
+        ON_OUTPUT,
+
+        /**
+         * the walker will deal with the calculation itself
+         */
+        HANDLED_IN_WALKER
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java
new file mode 100644
index 000000000..be227619f
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/ReadTransformersMode.java
@@ -0,0 +1,28 @@
+package org.broadinstitute.sting.gatk.iterators;
+
+import java.lang.annotation.*;
+
+/**
+ * User: hanna
+ * Date: May 14, 2009
+ * Time: 1:51:22 PM
+ * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT
+ * Software and documentation are copyright 2005 by the Broad Institute.
+ * All rights are reserved.
+ *
+ * Users acknowledge that this software is supplied without any warranty or support.
+ * The Broad Institute is not responsible for its use, misuse, or
+ * functionality.
+ */
+
+/**
+ * Allows the walker to indicate when read transformers should be applied (see ReadTransformer.ApplicationTime); defaults to ON_INPUT.
+ */
+
+@Documented
+@Inherited
+@Retention(RetentionPolicy.RUNTIME)
+@Target(ElementType.TYPE)
+public @interface ReadTransformersMode {
+    public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT;
+}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java
index f33dd414b..9578bba56 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/iterators/VerifyingSamIterator.java
@@ -1,7 +1,6 @@
 package org.broadinstitute.sting.gatk.iterators;
 
 import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.utils.GenomeLoc;
 import org.broadinstitute.sting.utils.GenomeLocParser;
 import org.broadinstitute.sting.utils.exceptions.UserException;
 
@@ -11,13 +10,11 @@ import java.util.Iterator;
  * Verifies that the incoming stream of reads is correctly sorted
  */
 public class VerifyingSamIterator implements StingSAMIterator {
-    private GenomeLocParser genomeLocParser;
     StingSAMIterator it;
     SAMRecord last = null;
     boolean checkOrderP = true;
 
-    public VerifyingSamIterator(GenomeLocParser genomeLocParser,StingSAMIterator it) {
-        this.genomeLocParser = genomeLocParser;
+    public VerifyingSamIterator(StingSAMIterator it) {
         this.it = it;
     }
 
@@ -48,9 +45,9 @@ public class VerifyingSamIterator implements StingSAMIterator {
             if(cur.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX || cur.getAlignmentStart() == SAMRecord.NO_ALIGNMENT_START)
                 throw new UserException.MalformedBAM(last,String.format("read %s has inconsistent mapping information.",cur.format()));
 
-            GenomeLoc lastLoc = genomeLocParser.createGenomeLoc( last );
-            GenomeLoc curLoc = genomeLocParser.createGenomeLoc( cur );
-            return curLoc.compareTo(lastLoc) == -1;
+            return (last.getReferenceIndex() > cur.getReferenceIndex()) ||
+                    (last.getReferenceIndex().equals(cur.getReferenceIndex()) &&
+                            last.getAlignmentStart() > cur.getAlignmentStart());
         }
     }
 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
index b60a7845a..51fed470f 100644
--- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java
@@ -32,6 +32,7 @@ import org.broadinstitute.sting.gatk.walkers.Walker;
 import org.broadinstitute.sting.utils.Utils;
 import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
 import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
 import org.jets3t.service.S3Service;
 import org.jets3t.service.S3ServiceException;
 import org.jets3t.service.impl.rest.httpclient.RestS3Service;
@@ -138,6 +139,24 @@ public class GATKRunReport {
     @Element(required = true, name = "iterations")
     private long nIterations;
 
+    @Element(required = true, name = "tag")
+    private String tag;
+
+    // -----------------------------------------------------------------
+    // elements related to multi-threading and efficiency
+    // -----------------------------------------------------------------
+
+    @Element(required = true, name = "numThreads")
+    private int numThreads;
+    @Element(required = true, name = "percent_time_running")
+    private String percentTimeRunning;
+    @Element(required = true, name = "percent_time_waiting")
+    private String percentTimeWaiting;
+    @Element(required = true, name = "percent_time_blocking")
+    private String percentTimeBlocking;
+    @Element(required = true, name = "percent_time_waiting_for_io")
+    private String percentTimeWaitingForIO;
+
     public enum PhoneHomeOption {
         /** Disable phone home */
         NO_ET,
@@ -186,6 +205,8 @@ public class GATKRunReport {
             nIterations = engine.getCumulativeMetrics().getNumIterations();
         }
 
+        tag = engine.getArguments().tag;
+
         // user and hostname -- information about the runner of the GATK
         userName = System.getProperty("user.name");
         hostName = Utils.resolveHostname();
@@ -196,12 +217,30 @@ public class GATKRunReport {
 
         // if there was an exception, capture it
         this.mException = e == null ? null : new ExceptionToXML(e);
+
+        numThreads = engine.getTotalNumberOfThreads();
+        percentTimeRunning = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.USER_CPU);
+        percentTimeBlocking = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.BLOCKING);
+        percentTimeWaiting = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING);
+        percentTimeWaitingForIO = getThreadEfficiencyPercent(engine, ThreadEfficiencyMonitor.State.WAITING_FOR_IO);
     }
 
     public String getID() {
         return id;
     }
 
+    /**
+     * Return a string representing the percent of time the GATK spent in state, if possible.  Otherwise return NA
+     *
+     * @param engine the GATK engine whose threading efficiency info we will use
+     * @param state the state whose occupancy we wish to know
+     * @return a string representation of the percent occupancy of state, or NA if the engine has no thread efficiency monitor
+     */
+    private String getThreadEfficiencyPercent(final GenomeAnalysisEngine engine, final ThreadEfficiencyMonitor.State state) {
+        final ThreadEfficiencyMonitor tem = engine.getThreadEfficiencyMonitor();
+        return tem == null ? "NA" : String.format("%.2f", tem.getStatePercent(state));
+    }
+
 
     public void postReport(PhoneHomeOption type) {
         logger.debug("Posting report of type " + type);
diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java
deleted file mode 100644
index 96dbd15f2..000000000
--- a/public/java/src/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTracker.java
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) 2010.  The Broad Institute
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-package org.broadinstitute.sting.gatk.refdata;
-
-import net.sf.samtools.SAMRecord;
-import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer;
-import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
-import org.broadinstitute.sting.utils.GenomeLoc;
-import org.broadinstitute.sting.utils.GenomeLocParser;
-
-import java.util.*;
-
-
-/**
- * @author aaron
- *         

- * Class ReadMetaDataTracker - *

- * a read-based meta data tracker - */ -public class ReadMetaDataTracker { - /** - * The parser, used to create new GenomeLocs. - */ - private final GenomeLocParser genomeLocParser; - - private final SAMRecord record; - - // the buffer of positions and RODs we've stored - private final TreeMap mapping; - - /** - * create a read meta data tracker, given the read and a queue of RODatum positions - * - * @param record the read to create offset from - * @param mapping the mapping of reference ordered datum - */ - public ReadMetaDataTracker(GenomeLocParser genomeLocParser, SAMRecord record, TreeMap mapping) { - this.genomeLocParser = genomeLocParser; - this.record = record; - this.mapping = mapping; - } - - /** - * create an alignment of read position to reference ordered datum - * - * @param record the SAMRecord - * @param queue the queue (as a tree set) - * @param cl the class name, null if not filtered by classname - * @param name the datum track name, null if not filtered by name - * - * @return a mapping from the position in the read to the reference ordered datum - */ - private Map> createReadAlignment(SAMRecord record, TreeMap queue, Class cl, String name) { - if (name != null && cl != null) throw new IllegalStateException("Both a class and name cannot be specified"); - Map> ret = new LinkedHashMap>(); - GenomeLoc location = genomeLocParser.createGenomeLoc(record); - int length = record.getReadLength(); - for (Integer loc : queue.keySet()) { - Integer position = loc - location.getStart(); - if (position >= 0 && position < length) { - Collection set; - if (cl != null) - set = queue.get(loc).getSet(cl); - else - set = queue.get(loc).getSet(name); - if (set != null && set.size() > 0) - ret.put(position, set); - } - } - return ret; - - } - - /** - * create an alignment of read position to reference ordered datum - * - * @return a mapping from the position in the read to the reference ordered datum - */ - private Map> createGenomeLocAlignment(SAMRecord record, 
TreeMap mapping, Class cl, String name) { - Map> ret = new LinkedHashMap>(); - int start = record.getAlignmentStart(); - int stop = record.getAlignmentEnd(); - for (Integer location : mapping.keySet()) { - if (location >= start && location <= stop) - if (cl != null) - ret.put(location, mapping.get(location).getSet(cl)); - else - ret.put(location, mapping.get(location).getSet(name)); - } - return ret; - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of read offset to ROD(s) - */ - public Map> getReadOffsetMapping() { - return createReadAlignment(record, mapping, null, null); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of genome loc position to ROD(s) - */ - public Map> getContigOffsetMapping() { - return createGenomeLocAlignment(record, mapping, null, null); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of read offset to ROD(s) - */ - public Map> getReadOffsetMapping(String name) { - return createReadAlignment(record, mapping, null, name); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of genome loc position to ROD(s) - */ - public Map> getContigOffsetMapping(String name) { - return createGenomeLocAlignment(record, mapping, null, name); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of read offset to ROD(s) - */ - public Map> getReadOffsetMapping(Class cl) { - return createReadAlignment(record, mapping, cl, null); - } - - /** - * get the position mapping, from read offset to ROD - * - * @return a mapping of genome loc position to ROD(s) - */ - public Map> getContigOffsetMapping(Class cl) { - return createGenomeLocAlignment(record, mapping, cl, null); - } - - /** - * get the list of all the RODS overlapping this read, without any information about their position - * @return a Collection (no order guaranteed), of all the RODs covering this read 
- */ - public List getAllCoveringRods() { - List ret = new ArrayList(); - for (Map.Entry entry : mapping.entrySet()) - ret.addAll(entry.getValue().getSet()); - return ret; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java index 2c2ee51bb..7e32ec112 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/RefMetaDataTracker.java @@ -5,7 +5,6 @@ import com.google.java.contract.Requires; import org.apache.log4j.Logger; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.RodBinding; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; @@ -32,11 +31,10 @@ import java.util.*; * Time: 3:05:23 PM */ public class RefMetaDataTracker { - // TODO: this should be a list, not a map, actually + // TODO: this should be a list, not a bindings, actually private final static RODRecordList EMPTY_ROD_RECORD_LIST = new RODRecordListImpl("EMPTY"); - final Map map; - final ReferenceContext ref; + final Map bindings; final protected static Logger logger = Logger.getLogger(RefMetaDataTracker.class); // ------------------------------------------------------------------------------------------ @@ -48,28 +46,25 @@ public class RefMetaDataTracker { // ------------------------------------------------------------------------------------------ /** - * Only for testing -- not accesssible in any other context + * Create an tracker with no bindings */ public RefMetaDataTracker() { - ref = null; - map = Collections.emptyMap(); + bindings = Collections.emptyMap(); } - public RefMetaDataTracker(final Collection allBindings, final ReferenceContext ref) { - this.ref = ref; - 
- // set up the map + public RefMetaDataTracker(final Collection allBindings) { + // set up the bindings if ( allBindings.isEmpty() ) - map = Collections.emptyMap(); + bindings = Collections.emptyMap(); else { - Map tmap = new HashMap(allBindings.size()); + final Map tmap = new HashMap(allBindings.size()); for ( RODRecordList rod : allBindings ) { if ( rod != null && ! rod.isEmpty() ) tmap.put(canonicalName(rod.getName()), rod); } - // ensure that no one modifies the map itself - map = Collections.unmodifiableMap(tmap); + // ensure that no one modifies the bindings itself + bindings = Collections.unmodifiableMap(tmap); } } @@ -99,7 +94,7 @@ public class RefMetaDataTracker { @Requires({"type != null"}) @Ensures("result != null") public List getValues(final Class type) { - return addValues(map.keySet(), type, new ArrayList(), null, false, false); + return addValues(bindings.keySet(), type, new ArrayList(), null, false, false); } /** @@ -114,7 +109,7 @@ public class RefMetaDataTracker { @Requires({"type != null", "onlyAtThisLoc != null"}) @Ensures("result != null") public List getValues(final Class type, final GenomeLoc onlyAtThisLoc) { - return addValues(map.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); + return addValues(bindings.keySet(), type, new ArrayList(), onlyAtThisLoc, true, false); } /** @@ -296,7 +291,7 @@ public class RefMetaDataTracker { */ @Requires({"rodBinding != null"}) public boolean hasValues(final RodBinding rodBinding) { - return map.containsKey(canonicalName(rodBinding.getName())); + return bindings.containsKey(canonicalName(rodBinding.getName())); } /** @@ -306,7 +301,7 @@ public class RefMetaDataTracker { * @return List of all tracks */ public List getBoundRodTracks() { - return new ArrayList(map.values()); + return new ArrayList(bindings.values()); } /** @@ -314,38 +309,30 @@ public class RefMetaDataTracker { * @return the number of tracks with at least one bound Feature */ public int getNTracksWithBoundFeatures() { - return 
map.size(); + return bindings.size(); } // ------------------------------------------------------------------------------------------ - // - // - // old style accessors - // - // TODO -- DELETE ME - // - // + // Protected accessors using strings for unit testing // ------------------------------------------------------------------------------------------ - @Deprecated - public boolean hasValues(final String name) { - return map.containsKey(canonicalName(name)); + protected boolean hasValues(final String name) { + return bindings.containsKey(canonicalName(name)); } - @Deprecated - public List getValues(final Class type, final String name) { + protected List getValues(final Class type, final String name) { return addValues(name, type, new ArrayList(), getTrackDataByName(name), null, false, false); } - @Deprecated - public List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + + protected List getValues(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { return addValues(name, type, new ArrayList(), getTrackDataByName(name), onlyAtThisLoc, true, false); } - @Deprecated - public T getFirstValue(final Class type, final String name) { + + protected T getFirstValue(final Class type, final String name) { return safeGetFirst(getValues(type, name)); } - @Deprecated - public T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { + + protected T getFirstValue(final Class type, final String name, final GenomeLoc onlyAtThisLoc) { return safeGetFirst(getValues(type, name, onlyAtThisLoc)); } @@ -366,7 +353,7 @@ public class RefMetaDataTracker { * @return */ @Requires({"l != null"}) - final private T safeGetFirst(final List l) { + private T safeGetFirst(final List l) { return l.isEmpty() ? 
null : l.get(0); } @@ -435,7 +422,7 @@ public class RefMetaDataTracker { */ private RODRecordList getTrackDataByName(final String name) { final String luName = canonicalName(name); - RODRecordList l = map.get(luName); + RODRecordList l = bindings.get(luName); return l == null ? EMPTY_ROD_RECORD_LIST : l; } @@ -448,7 +435,7 @@ public class RefMetaDataTracker { * @param name the name of the rod * @return canonical name of the rod */ - private final String canonicalName(final String name) { + private String canonicalName(final String name) { // todo -- remove me after switch to RodBinding syntax return name.toLowerCase(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java index 2b46414a8..5c7da82d0 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java +++ b/public/java/src/org/broadinstitute/sting/gatk/refdata/VariantContextAdaptors.java @@ -364,7 +364,7 @@ public class VariantContextAdaptors { long end = hapmap.getEnd(); if ( deletionLength > 0 ) - end += deletionLength; + end += (deletionLength - 1); VariantContext vc = new VariantContextBuilder(name, hapmap.getChr(), hapmap.getStart(), end, alleles).id(hapmap.getName()).genotypes(genotypes).make(); return vc; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java index 0c81af07b..c86f06c25 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/resourcemanagement/ThreadAllocation.java @@ -24,7 +24,7 @@ package org.broadinstitute.sting.gatk.resourcemanagement; -import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; /** * Models how threads 
are distributed between various components of the GATK. @@ -33,61 +33,83 @@ public class ThreadAllocation { /** * The number of CPU threads to be used by the GATK. */ - private final int numCPUThreads; + private final int numDataThreads; + + /** + * The number of CPU threads per data thread for GATK processing + */ + private final int numCPUThreadsPerDataThread; /** * Number of threads to devote exclusively to IO. Default is 0. */ private final int numIOThreads; - public int getNumCPUThreads() { - return numCPUThreads; + /** + * Should we monitor thread efficiency? + */ + private final boolean monitorEfficiency; + + public int getNumDataThreads() { + return numDataThreads; + } + + public int getNumCPUThreadsPerDataThread() { + return numCPUThreadsPerDataThread; } public int getNumIOThreads() { return numIOThreads; } + public boolean monitorThreadEfficiency() { + return monitorEfficiency; + } + + /** + * Are we running in parallel mode? + * + * @return true if any parallel processing is enabled + */ + public boolean isRunningInParallelMode() { + return getTotalNumThreads() > 1; + } + + /** + * What is the total number of threads in use by the GATK? + * + * @return the sum of all thread allocations in this object + */ + public int getTotalNumThreads() { + return getNumDataThreads() * getNumCPUThreadsPerDataThread() + getNumIOThreads(); + } + /** * Construct the default thread allocation. */ public ThreadAllocation() { - this(1,null,null); + this(1, 1, 0, false); } /** * Set up the thread allocation. Default allocation is 1 CPU thread, 0 IO threads. * (0 IO threads means that no threads are devoted exclusively to IO; they're inline on the CPU thread). - * @param totalThreads Complete number of threads to allocate. - * @param numCPUThreads Total number of threads allocated to the traversal. + * @param numDataThreads Total number of threads allocated to the traversal. 
+ * @param numCPUThreadsPerDataThread The number of CPU threads per data thread to allocate * @param numIOThreads Total number of threads allocated exclusively to IO. + * @param monitorEfficiency should we monitor threading efficiency in the GATK? */ - public ThreadAllocation(final int totalThreads, final Integer numCPUThreads, final Integer numIOThreads) { - // If no allocation information is present, allocate all threads to CPU - if(numCPUThreads == null && numIOThreads == null) { - this.numCPUThreads = totalThreads; - this.numIOThreads = 0; - } - // If only CPU threads are specified, allocate remainder to IO (minimum 0 dedicated IO threads). - else if(numIOThreads == null) { - if(numCPUThreads > totalThreads) - throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of cpu threads (%d) is higher than the total threads",totalThreads,numCPUThreads)); - this.numCPUThreads = numCPUThreads; - this.numIOThreads = totalThreads - numCPUThreads; - } - // If only IO threads are specified, allocate remainder to CPU (minimum 1 dedicated CPU thread). - else if(numCPUThreads == null) { - if(numIOThreads > totalThreads) - throw new UserException(String.format("Invalid thread allocation. User requested %d threads in total, but the count of io threads (%d) is higher than the total threads",totalThreads,numIOThreads)); - this.numCPUThreads = Math.max(1,totalThreads-numIOThreads); - this.numIOThreads = numIOThreads; - } - else { - if(numCPUThreads + numIOThreads != totalThreads) - throw new UserException(String.format("Invalid thread allocation. 
User requested %d threads in total, but the count of cpu threads (%d) + the count of io threads (%d) does not match",totalThreads,numCPUThreads,numIOThreads)); - this.numCPUThreads = numCPUThreads; - this.numIOThreads = numIOThreads; - } - } + public ThreadAllocation(final int numDataThreads, + final int numCPUThreadsPerDataThread, + final int numIOThreads, + final boolean monitorEfficiency) { + if ( numDataThreads < 1 ) throw new ReviewedStingException("numDataThreads cannot be less than 1, but saw " + numDataThreads); + if ( numCPUThreadsPerDataThread < 1 ) throw new ReviewedStingException("numCPUThreadsPerDataThread cannot be less than 1, but saw " + numCPUThreadsPerDataThread); + if ( numIOThreads < 0 ) throw new ReviewedStingException("numIOThreads cannot be less than 0, but saw " + numIOThreads); + this.numDataThreads = numDataThreads; + this.numCPUThreadsPerDataThread = numCPUThreadsPerDataThread; + this.numIOThreads = numIOThreads; + this.monitorEfficiency = monitorEfficiency; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java index 31149cd8a..3de85028f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDB.java @@ -168,13 +168,70 @@ public class SampleDB { return families; } + /** + * Returns all the trios present in the sample database. 
The strictOneChild parameter determines + * whether multiple children of the same parents resolve to multiple trios, or are excluded + * @param strictOneChild - exclude pedigrees with >1 child for parental pair + * @return - all of the mother+father=child triplets, subject to strictOneChild + */ + public final Set getTrios(boolean strictOneChild) { + Set trioSet = new HashSet(); + for ( String familyString : getFamilyIDs() ) { + Set family = getFamily(familyString); + for ( Sample sample : family) { + if ( sample.getParents().size() == 2 ) { + Trio trio = new Trio(sample.getMother(),sample.getFather(),sample); + trioSet.add(trio); + } + } + } + + if ( strictOneChild ) + trioSet = removeTriosWithSameParents(trioSet); + + return trioSet; + } + + /** + * Returns all the trios present in the db. See getTrios(boolean strictOneChild) + * @return all the trios present in the samples db. + */ + public final Set getTrios() { + return getTrios(false); + } + + /** + * Subsets a set of trios to only those with nonmatching founders. If two (or more) trio objects have + * the same mother and father, then both (all) are removed from the returned set. + * @param trios - a set of Trio objects + * @return those subset of Trio objects in the input set with nonmatching founders + */ + private Set removeTriosWithSameParents(final Set trios) { + Set filteredTrios = new HashSet(); + filteredTrios.addAll(trios); + Set triosWithSameParents = new HashSet(); + for ( Trio referenceTrio : filteredTrios ) { + for ( Trio compareTrio : filteredTrios ) { + if ( referenceTrio != compareTrio && + referenceTrio.getFather().equals(compareTrio.getFather()) && + referenceTrio.getMother().equals(compareTrio.getMother()) ) { + triosWithSameParents.add(referenceTrio); + triosWithSameParents.add(compareTrio); + } + } + } + filteredTrios.removeAll(triosWithSameParents); + return filteredTrios; + } /** * Returns the set of all children that have both of their parents. 
* Note that if a family is composed of more than 1 child, each child is * returned. * @return - all the children that have both of their parents + * @deprecated - getTrios() replaces this function */ + @Deprecated public final Set getChildrenWithParents(){ return getChildrenWithParents(false); } @@ -188,7 +245,15 @@ public class SampleDB { * * @param triosOnly - if set to true, only strict trios are returned * @return - all the children that have both of their parents + * @deprecated - getTrios(boolean strict) replaces this function + * @bug -- does not work for extracting multiple generations of trios, e.g. + * ..........Mom1------Dad1 + * ................| + * ..............Child1--------Mom2 + * .......................| + * .....................Child2 */ + @Deprecated public final Set getChildrenWithParents(boolean triosOnly) { Map> families = getFamilies(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java index 44a8600b0..612e342db 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/SampleDBBuilder.java @@ -135,9 +135,8 @@ public class SampleDBBuilder { // -------------------------------------------------------------------------------- protected final void validate() { - if ( validationStrictness == PedigreeValidationType.SILENT ) - return; - else { + validatePedigreeIDUniqueness(); + if ( validationStrictness != PedigreeValidationType.SILENT ) { // check that samples in data sources are all annotated, if anything is annotated if ( ! samplesFromPedigrees.isEmpty() && ! 
samplesFromDataSources.isEmpty() ) { final Set sampleNamesFromPedigrees = new HashSet(); @@ -150,4 +149,12 @@ public class SampleDBBuilder { } } } + + private void validatePedigreeIDUniqueness() { + Set pedigreeIDs = new HashSet(); + for ( Sample sample : samplesFromPedigrees ) { + pedigreeIDs.add(sample.getID()); + } + assert pedigreeIDs.size() == samplesFromPedigrees.size() : "The number of sample IDs extracted from the pedigree does not equal the number of samples in the pedigree. Is a sample associated with multiple families?"; + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java b/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java new file mode 100644 index 000000000..314baad3d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/samples/Trio.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.gatk.samples; + +/** + * A class for imposing a trio structure on three samples; a common paradigm + * + * todo -- there should probably be an interface or abstract class "Pedigree" that generalizes the notion of + * -- imposing structure on samples. But given how complex pedigrees can quickly become, it's not + * -- clear the best way to do this. 
+ */ +public class Trio { + private Sample mother; + private Sample father; + private Sample child; + + public Trio(Sample mom, Sample dad, Sample spawn) { + assert mom.getID().equals(spawn.getMaternalID()) && dad.getID().equals(spawn.getPaternalID()) : "Samples passed to trio constructor do not form a trio"; + mother = mom; + father = dad; + child = spawn; + } + + public Sample getMother() { + return mother; + } + + public String getMaternalID() { + return mother.getID(); + } + + public Sample getFather() { + return father; + } + + public String getPaternalID() { + return father.getID(); + } + + public Sample getChild() { + return child; + } + + public String getChildID() { + return child.getID(); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java index abc71e549..668bddcca 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraversalEngine.java @@ -30,82 +30,29 @@ import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.exceptions.UserException; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileOutputStream; -import java.io.PrintStream; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; +import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; public abstract class TraversalEngine,ProviderType extends ShardDataProvider> { - // Time in milliseconds since we 
initialized this engine - private static final int HISTORY_WINDOW_SIZE = 50; - - private static class ProcessingHistory { - double elapsedSeconds; - long unitsProcessed; - long bpProcessed; - GenomeLoc loc; - - public ProcessingHistory(double elapsedSeconds, GenomeLoc loc, long unitsProcessed, long bpProcessed) { - this.elapsedSeconds = elapsedSeconds; - this.loc = loc; - this.unitsProcessed = unitsProcessed; - this.bpProcessed = bpProcessed; - } - - } - - /** lock object to sure updates to history are consistent across threads */ - private static final Object lock = new Object(); - LinkedList history = new LinkedList(); - - /** We use the SimpleTimer to time our run */ - private SimpleTimer timer = null; - - // How long can we go without printing some progress info? - private static final int PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES = 1000; - private int printProgressCheckCounter = 0; - private long lastProgressPrintTime = -1; // When was the last time we printed progress log? - private long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; // in milliseconds - private long PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds - private final double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; - private final double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; - private boolean progressMeterInitialized = false; - - // for performance log - private static final boolean PERFORMANCE_LOG_ENABLED = true; - private final Object performanceLogLock = new Object(); - private File performanceLogFile; - private PrintStream performanceLog = null; - private long lastPerformanceLogPrintTime = -1; // When was the last time we printed to the performance log? - private final long PERFORMANCE_LOG_PRINT_FREQUENCY = PROGRESS_PRINT_FREQUENCY; // in milliseconds - - /** Size, in bp, of the area we are processing. 
Updated once in the system in initial for performance reasons */ - long targetSize = -1; - GenomeLocSortedSet targetIntervals = null; - /** our log, which we want to capture anything from this class */ protected static final Logger logger = Logger.getLogger(TraversalEngine.class); protected GenomeAnalysisEngine engine; + private ProgressMeter progressMeter; // ---------------------------------------------------------------------------------------------------- // // ABSTRACT METHODS // // ---------------------------------------------------------------------------------------------------- + /** - * Gets the named traversal type associated with the given traversal. + * Gets the named traversal type associated with the given traversal, such as loci, reads, etc. + * * @return A user-friendly name for the given traversal type. */ - protected abstract String getTraversalType(); + public abstract String getTraversalUnits(); /** * this method must be implemented by all traversal engines @@ -120,264 +67,67 @@ public abstract class TraversalEngine,Provide ProviderType dataProvider, T sum); - // ---------------------------------------------------------------------------------------------------- - // - // Common timing routines - // - // ---------------------------------------------------------------------------------------------------- /** * Initialize the traversal engine. 
After this point traversals can be run over the data + * * @param engine GenomeAnalysisEngine for this traversal + * @param progressMeter An optional (null == optional) meter to track our progress */ - public void initialize(GenomeAnalysisEngine engine) { + public void initialize(final GenomeAnalysisEngine engine, final ProgressMeter progressMeter) { if ( engine == null ) throw new ReviewedStingException("BUG: GenomeAnalysisEngine cannot be null!"); this.engine = engine; - - if ( PERFORMANCE_LOG_ENABLED && engine.getArguments() != null && engine.getArguments().performanceLog != null ) { - synchronized(this.performanceLogLock) { - performanceLogFile = engine.getArguments().performanceLog; - createNewPerformanceLog(); - } - } - - // if we don't have any intervals defined, create intervals from the reference itself - if ( this.engine.getIntervals() == null ) - targetIntervals = GenomeLocSortedSet.createSetFromSequenceDictionary(engine.getReferenceDataSource().getReference().getSequenceDictionary()); - else - targetIntervals = this.engine.getIntervals(); - targetSize = targetIntervals.coveredSize(); - } - - private void createNewPerformanceLog() { - synchronized(performanceLogLock) { - try { - performanceLog = new PrintStream(new FileOutputStream(engine.getArguments().performanceLog)); - List pLogHeader = Arrays.asList("elapsed.time", "units.processed", "processing.speed", "bp.processed", "bp.speed", "genome.fraction.complete", "est.total.runtime", "est.time.remaining"); - performanceLog.println(Utils.join("\t", pLogHeader)); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(engine.getArguments().performanceLog, e); - } - } - } - /** - * Should be called to indicate that we're going to process records and the timer should start ticking. This - * function should be called right before any traversal work is done, to avoid counting setup costs in the - * processing costs and inflating the estimated runtime. 
- */ - public void startTimersIfNecessary() { - if ( timer == null ) { - timer = new SimpleTimer("Traversal"); - timer.start(); - lastProgressPrintTime = timer.currentTime(); - } + this.progressMeter = progressMeter; } /** - * @param curTime (current runtime, in millisecs) - * @param lastPrintTime the last time we printed, in machine milliseconds - * @param printFreq maximum permitted difference between last print and current times + * For testing only. Does not initialize the progress meter * - * @return true if the maximum interval (in millisecs) has passed since the last printing + * @param engine */ - private boolean maxElapsedIntervalForPrinting(final long curTime, long lastPrintTime, long printFreq) { - long elapsed = curTime - lastPrintTime; - return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS; + protected void initialize(final GenomeAnalysisEngine engine) { + initialize(engine, null); } /** - * Forward request to printProgress + * Called by the MicroScheduler when all work is done and the GATK is shutting down. 
+ * + * To be used by subclasses that need to free up resources (such as threads) + */ + public void shutdown() { + // by default there's nothing to do + } + + /** + * Update the cumulative traversal metrics according to the data in this shard + * + * @param shard a non-null shard + */ + public void updateCumulativeMetrics(final Shard shard) { + updateCumulativeMetrics(shard.getReadMetrics()); + } + + /** + * Update the cumulative traversal metrics according to the data in this shard + * + * @param singleTraverseMetrics read metrics object containing the information about a single shard's worth + * of data processing + */ + public void updateCumulativeMetrics(final ReadMetrics singleTraverseMetrics) { + engine.getCumulativeMetrics().incrementMetrics(singleTraverseMetrics); + } + + /** + * Forward request to notifyOfProgress + * + * Assumes that one cycle has been completed * - * @param shard the given shard currently being processed. * @param loc the location */ - public void printProgress(Shard shard, GenomeLoc loc) { - // A bypass is inserted here for unit testing. - printProgress(loc,shard.getReadMetrics(),false); - } - - /** - * Utility routine that prints out process information (including timing) every N records or - * every M seconds, for N and M set in global variables. 
- * - * @param loc Current location, can be null if you are at the end of the traversal - * @param metrics Data processed since the last cumulative - * @param mustPrint If true, will print out info, regardless of nRecords or time interval - */ - private void printProgress(GenomeLoc loc, ReadMetrics metrics, boolean mustPrint) { - if ( mustPrint || printProgressCheckCounter++ % PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES != 0 ) - // don't do any work more often than PRINT_PROGRESS_CHECK_FREQUENCY_IN_CYCLES - return; - - if(!progressMeterInitialized && mustPrint == false ) { - logger.info("[INITIALIZATION COMPLETE; TRAVERSAL STARTING]"); - logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", - "Location", getTraversalType(), getTraversalType())); - progressMeterInitialized = true; - } - - final long curTime = timer.currentTime(); - boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, PROGRESS_PRINT_FREQUENCY); - boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); - - if ( printProgress || printLog ) { - // getting and appending metrics data actually turns out to be quite a heavyweight - // operation. Postpone it until after determining whether to print the log message. - ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics() != null ? 
engine.getCumulativeMetrics() : new ReadMetrics(); - if(metrics != null) - cumulativeMetrics.incrementMetrics(metrics); - - final long nRecords = cumulativeMetrics.getNumIterations(); - - ProcessingHistory last = updateHistory(loc,cumulativeMetrics); - - final AutoFormattingTime elapsed = new AutoFormattingTime(last.elapsedSeconds); - final AutoFormattingTime bpRate = new AutoFormattingTime(secondsPerMillionBP(last)); - final AutoFormattingTime unitRate = new AutoFormattingTime(secondsPerMillionElements(last)); - final double fractionGenomeTargetCompleted = calculateFractionGenomeTargetCompleted(last); - final AutoFormattingTime estTotalRuntime = new AutoFormattingTime(elapsed.getTimeInSeconds() / fractionGenomeTargetCompleted); - final AutoFormattingTime timeToCompletion = new AutoFormattingTime(estTotalRuntime.getTimeInSeconds() - elapsed.getTimeInSeconds()); - - if ( printProgress ) { - lastProgressPrintTime = curTime; - - // dynamically change the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates - if ( estTotalRuntime.getTimeInSeconds() > TWELVE_HOURS_IN_SECONDS ) - PROGRESS_PRINT_FREQUENCY = 60 * 1000; // in milliseconds - else if ( estTotalRuntime.getTimeInSeconds() > TWO_HOURS_IN_SECONDS ) - PROGRESS_PRINT_FREQUENCY = 30 * 1000; // in milliseconds - else - PROGRESS_PRINT_FREQUENCY = 10 * 1000; // in milliseconds - - logger.info(String.format("%15s %5.2e %s %s %4.1f%% %s %s", - loc == null ? 
"done with mapped reads" : loc, nRecords*1.0, elapsed, unitRate, - 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); - - } - - if ( printLog ) { - lastPerformanceLogPrintTime = curTime; - synchronized(performanceLogLock) { - performanceLog.printf("%.2f\t%d\t%.2e\t%d\t%.2e\t%.2e\t%.2f\t%.2f%n", - elapsed.getTimeInSeconds(), nRecords, unitRate.getTimeInSeconds(), last.bpProcessed, - bpRate.getTimeInSeconds(), fractionGenomeTargetCompleted, estTotalRuntime.getTimeInSeconds(), - timeToCompletion.getTimeInSeconds()); - } - } - } - } - - /** - * Keeps track of the last HISTORY_WINDOW_SIZE data points for the progress meter. Currently the - * history isn't used in any way, but in the future it'll become valuable for more accurate estimates - * for when a process will complete. - * - * @param loc our current position. If null, assumes we are done traversing - * @param metrics information about what's been processed already - * @return - */ - private final ProcessingHistory updateHistory(GenomeLoc loc, ReadMetrics metrics) { - synchronized (lock) { - if ( history.size() > HISTORY_WINDOW_SIZE ) - history.pop(); - - long nRecords = metrics.getNumIterations(); - long bpProcessed = loc == null ? targetSize : targetIntervals.sizeBeforeLoc(loc); // null -> end of processing - history.add(new ProcessingHistory(timer.getElapsedTime(), loc, nRecords, bpProcessed)); - - return history.getLast(); - } - } - - /** How long in seconds to process 1M traversal units? */ - private final double secondsPerMillionElements(ProcessingHistory last) { - return (last.elapsedSeconds * 1000000.0) / Math.max(last.unitsProcessed, 1); - } - - /** How long in seconds to process 1M bp on the genome? */ - private final double secondsPerMillionBP(ProcessingHistory last) { - return (last.elapsedSeconds * 1000000.0) / Math.max(last.bpProcessed, 1); - } - - /** What fractoin of the target intervals have we covered? 
*/ - private final double calculateFractionGenomeTargetCompleted(ProcessingHistory last) { - return (1.0*last.bpProcessed) / targetSize; - } - - /** - * Called after a traversal to print out information about the traversal process - */ - public void printOnTraversalDone() { - printProgress(null, null, true); - - final double elapsed = timer == null ? 0 : timer.getElapsedTime(); - - ReadMetrics cumulativeMetrics = engine.getCumulativeMetrics(); - - // count up the number of skipped reads by summing over all filters - long nSkippedReads = 0L; - for ( final long countsByFilter : cumulativeMetrics.getCountsByFilter().values()) - nSkippedReads += countsByFilter; - - logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours", elapsed, elapsed / 60, elapsed / 3600)); - if ( cumulativeMetrics.getNumReadsSeen() > 0 ) - logger.info(String.format("%d reads were filtered out during traversal out of %d total (%.2f%%)", - nSkippedReads, - cumulativeMetrics.getNumReadsSeen(), - 100.0 * MathUtils.ratio(nSkippedReads,cumulativeMetrics.getNumReadsSeen()))); - for ( Map.Entry filterCounts : cumulativeMetrics.getCountsByFilter().entrySet() ) { - long count = filterCounts.getValue(); - logger.info(String.format(" -> %d reads (%.2f%% of total) failing %s", - count, 100.0 * MathUtils.ratio(count,cumulativeMetrics.getNumReadsSeen()), filterCounts.getKey())); - } - - if ( performanceLog != null ) performanceLog.close(); - } - - /** - * Gets the filename to which performance data is currently being written. - * @return Filename to which performance data is currently being written. - */ - public String getPerformanceLogFileName() { - synchronized(performanceLogLock) { - return performanceLogFile.getAbsolutePath(); - } - } - - /** - * Sets the filename of the log for performance. If set, will write performance data. - * @param fileName filename to use when writing performance data. 
- */ - public void setPerformanceLogFileName(String fileName) { - File file = new File(fileName); - - synchronized(performanceLogLock) { - // Ignore multiple calls to reset the same lock. - if(performanceLogFile != null && performanceLogFile.equals(file)) - return; - - // Close an existing log - if(performanceLog != null) performanceLog.close(); - - performanceLogFile = file; - createNewPerformanceLog(); - } - } - - /** - * Gets the frequency with which performance data is written. - * @return Frequency, in seconds, of performance log writes. - */ - public long getPerformanceProgressPrintFrequencySeconds() { - return PROGRESS_PRINT_FREQUENCY; - } - - /** - * How often should the performance log message be written? - * @param seconds number of seconds between messages indicating performance frequency. - */ - public void setPerformanceProgressPrintFrequencySeconds(long seconds) { - PROGRESS_PRINT_FREQUENCY = seconds; + public void printProgress(final GenomeLoc loc) { + if ( progressMeter != null ) + progressMeter.notifyOfProgress(loc, engine.getCumulativeMetrics().getNumIterations()); } } + diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 979e0f2d6..5d38df0f5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import 
org.broadinstitute.sting.utils.pileup.PileupElement; @@ -31,11 +32,11 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); + private final LinkedList workQueue = new LinkedList(); private final LinkedHashSet myReads = new LinkedHashSet(); @Override - protected String getTraversalType() { + public String getTraversalUnits() { return "active regions"; } @@ -103,25 +104,27 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); + final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); // add active regions to queue of regions to process // first check if can merge active regions over shard boundaries if( !activeRegions.isEmpty() ) { if( !workQueue.isEmpty() ) { - final org.broadinstitute.sting.utils.activeregion.ActiveRegion last = workQueue.getLast(); - final org.broadinstitute.sting.utils.activeregion.ActiveRegion first = activeRegions.get(0); + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { workQueue.removeLast(); activeRegions.remove(first); - workQueue.add( new org.broadinstitute.sting.utils.activeregion.ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); } } workQueue.addAll( activeRegions ); @@ -184,7 +187,7 @@ public class TraverseActiveRegions extends TraversalEngine walker ) { // Just want to output the active regions to a file, not actually process them - for( final 
org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion : workQueue ) { + for( final ActiveRegion activeRegion : workQueue ) { if( activeRegion.isActive ) { walker.activeRegionOutStream.println( activeRegion.getLocation() ); } @@ -197,7 +200,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine reads, final Queue workQueue, final T sum, final ActiveRegionWalker walker ) { + private T processActiveRegion( final ActiveRegion activeRegion, final LinkedHashSet reads, final Queue workQueue, final T sum, final ActiveRegionWalker walker ) { final ArrayList placedReads = new ArrayList(); for( final GATKSAMRecord read : reads ) { final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); if( activeRegion.getLocation().overlapsP( readLoc ) ) { // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); - org.broadinstitute.sting.utils.activeregion.ActiveRegion bestRegion = activeRegion; - for( final org.broadinstitute.sting.utils.activeregion.ActiveRegion otherRegionToTest : workQueue ) { + ActiveRegion bestRegion = activeRegion; + for( final ActiveRegion otherRegionToTest : workQueue ) { if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); bestRegion = otherRegionToTest; @@ -228,7 +231,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? 
"active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); final M x = walker.map( activeRegion, null ); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 3f349d86d..2e43ef8f8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -54,7 +54,7 @@ public class TraverseDuplicates extends TraversalEngine extends TraversalEngine extends TraversalEngine,LocusShardDataProvider> { - /** - * our log, which we want to capture anything from this class - */ - protected static final Logger logger = Logger.getLogger(TraversalEngine.class); - - @Override - protected String getTraversalType() { - return "sites"; - } - - @Override - public T traverse( LocusWalker walker, - LocusShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); - - LocusView locusView = getLocusView( walker, dataProvider ); - boolean done = false; - - if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all - - //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); - ReferenceOrderedView referenceOrderedDataView = null; - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); - else - referenceOrderedDataView = (RodLocusView)locusView; - - LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); - - // We keep processing while the next reference location is within the interval - while( locusView.hasNext() && ! 
done ) { - AlignmentContext locus = locusView.next(); - GenomeLoc location = locus.getLocation(); - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - final boolean keepMeP = walker.filter(tracker, refContext, locus); - if (keepMeP) { - M x = walker.map(tracker, refContext, locus); - sum = walker.reduce(x, sum); - done = walker.isDone(); - } - - printProgress(dataProvider.getShard(),locus.getLocation()); - } - } - - // We have a final map call to execute here to clean up the skipped based from the - // last position in the ROD to that in the interval - if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { - // only do this if the walker isn't done! - RodLocusView rodLocusView = (RodLocusView)locusView; - long nSkipped = rodLocusView.getLastSkippedBases(); - if ( nSkipped > 0 ) { - GenomeLoc site = rodLocusView.getLocOneBeyondShard(); - AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); - M x = walker.map(null, null, ac); - sum = walker.reduce(x, sum); - } - } - - return sum; - } - - /** - * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' - * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype - * that comes along. - * @param walker walker to interrogate. - * @param dataProvider Data which which to drive the locus view. 
- * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. - */ - private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { - DataSource dataSource = WalkerManager.getWalkerDataSource(walker); - if( dataSource == DataSource.READS ) - return new CoveredLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) - return new AllLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) - return new RodLocusView(dataProvider); - else - throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java new file mode 100755 index 000000000..84715e5fd --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseLociNano.java @@ -0,0 +1,280 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSProgressFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; 
+ +import java.util.Iterator; + +/** + * A simple solution to iterating over all reference positions over a series of genomic locations. + */ +public class TraverseLociNano extends TraversalEngine,LocusShardDataProvider> { + /** our log, which we want to capture anything from this class */ + private static final boolean DEBUG = false; + + final NanoScheduler nanoScheduler; + + public TraverseLociNano(int nThreads) { + nanoScheduler = new NanoScheduler(nThreads); + nanoScheduler.setProgressFunction(new TraverseLociProgress()); + } + + @Override + public final String getTraversalUnits() { + return "sites"; + } + + protected static class TraverseResults { + final int numIterations; + final T reduceResult; + + public TraverseResults(int numIterations, T reduceResult) { + this.numIterations = numIterations; + this.reduceResult = reduceResult; + } + } + + @Override + public T traverse( LocusWalker walker, + LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseLoci.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = getLocusView( walker, dataProvider ); + + if ( locusView.hasNext() ) { // trivial optimization to avoid unnecessary processing when there's nothing here at all + //ReferenceOrderedView referenceOrderedDataView = new ReferenceOrderedView( dataProvider ); + ReferenceOrderedView referenceOrderedDataView = null; + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + referenceOrderedDataView = new ManagingReferenceOrderedView( dataProvider ); + else + referenceOrderedDataView = (RodLocusView)locusView; + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + + final TraverseResults result = traverse( walker, locusView, referenceView, referenceOrderedDataView, sum ); + sum = result.reduceResult; + dataProvider.getShard().getReadMetrics().incrementNumIterations(result.numIterations); + updateCumulativeMetrics(dataProvider.getShard()); + } + + 
// We have a final map call to execute here to clean up the skipped based from the + // last position in the ROD to that in the interval + if ( WalkerManager.getWalkerDataSource(walker) == DataSource.REFERENCE_ORDERED_DATA && ! walker.isDone() ) { + // only do this if the walker isn't done! + final RodLocusView rodLocusView = (RodLocusView)locusView; + final long nSkipped = rodLocusView.getLastSkippedBases(); + if ( nSkipped > 0 ) { + final GenomeLoc site = rodLocusView.getLocOneBeyondShard(); + final AlignmentContext ac = new AlignmentContext(site, new ReadBackedPileupImpl(site), nSkipped); + final M x = walker.map(null, null, ac); + sum = walker.reduce(x, sum); + } + } + + return sum; + } + + /** + * Gets the best view of loci for this walker given the available data. The view will function as a 'trigger track' + * of sorts, providing a consistent interface so that TraverseLoci doesn't need to be reimplemented for any new datatype + * that comes along. + * @param walker walker to interrogate. + * @param dataProvider Data which which to drive the locus view. + * @return A view of the locus data, where one iteration of the locus view maps to one iteration of the traversal. + */ + private LocusView getLocusView( Walker walker, LocusShardDataProvider dataProvider ) { + final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); + if( dataSource == DataSource.READS ) + return new CoveredLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE ) //|| ! 
GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) + return new AllLocusView(dataProvider); + else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) + return new RodLocusView(dataProvider); + else + throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); + } + + protected TraverseResults traverse(final LocusWalker walker, + final LocusView locusView, + final LocusReferenceView referenceView, + final ReferenceOrderedView referenceOrderedDataView, + final T sum) { + nanoScheduler.setDebug(DEBUG); + final TraverseLociMap myMap = new TraverseLociMap(walker); + final TraverseLociReduce myReduce = new TraverseLociReduce(walker); + + final MapDataIterator inputIterator = new MapDataIterator(locusView, referenceView, referenceOrderedDataView); + final T result = nanoScheduler.execute(inputIterator, myMap, sum, myReduce); + + return new TraverseResults(inputIterator.numIterations, result); + } + + /** + * Create iterator that provides inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + */ + private class MapDataIterator implements Iterator { + final LocusView locusView; + final LocusReferenceView referenceView; + final ReferenceOrderedView referenceOrderedDataView; + int numIterations = 0; + + private MapDataIterator(LocusView locusView, LocusReferenceView referenceView, ReferenceOrderedView referenceOrderedDataView) { + this.locusView = locusView; + this.referenceView = referenceView; + this.referenceOrderedDataView = referenceOrderedDataView; + } + + @Override + public boolean hasNext() { + return locusView.hasNext(); + } + + @Override + public MapData next() { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + //logger.info("Pulling data from MapDataIterator at " + location); + + // create reference context. 
Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(location, refContext); + + numIterations++; + return new MapData(locus, refContext, tracker); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Cannot remove elements from MapDataIterator"); + } + } + + @Override + public void shutdown() { + nanoScheduler.shutdown(); + } + + /** + * The input data needed for each map call. The read, the reference, and the RODs + */ + private class MapData { + final AlignmentContext alignmentContext; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(final AlignmentContext alignmentContext, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.alignmentContext = alignmentContext; + this.refContext = refContext; + this.tracker = tracker; + } + + @Override + public String toString() { + return "MapData " + alignmentContext.getLocation(); + } + } + + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + 
* MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseLociMap implements NSMapFunction { + final LocusWalker walker; + + private TraverseLociMap(LocusWalker walker) { + this.walker = walker; + } + + @Override + public MapResult apply(final MapData data) { + if ( ! walker.isDone() ) { + final boolean keepMeP = walker.filter(data.tracker, data.refContext, data.alignmentContext); + if (keepMeP) { + final M x = walker.map(data.tracker, data.refContext, data.alignmentContext); + return new MapResult(x); + } + } + return SKIP_REDUCE; + } + } + + /** + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walkers reduce function to each map result, when applicable + */ + private class TraverseLociReduce implements NSReduceFunction { + final LocusWalker walker; + + private TraverseLociReduce(LocusWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on values that aren't DONE or FAILED + return walker.reduce(one.value, sum); + else + return sum; + } + } + + private class TraverseLociProgress implements NSProgressFunction { + @Override + public void progress(MapData lastProcessedMap) { + if (lastProcessedMap.alignmentContext != null) + printProgress(lastProcessedMap.alignmentContext.getLocation()); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java index ebaac40af..8273e1328 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadPairs.java @@ -27,7 +27,7 @@ public class TraverseReadPairs extends TraversalEngine extends 
TraversalEngine walker, ReadShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider)); + logger.debug(String.format("TraverseReadsPairs.traverse Covered dataset is %s", dataProvider)); if( !dataProvider.hasReads() ) throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); @@ -65,7 +65,8 @@ public class TraverseReadPairs extends TraversalEngine - * Class TraverseReads - *

- * This class handles traversing by reads in the new shardable style - */ -public class TraverseReads extends TraversalEngine,ReadShardDataProvider> { - /** our log, which we want to capture anything from this class */ - protected static final Logger logger = Logger.getLogger(TraverseReads.class); - - @Override - protected String getTraversalType() { - return "reads"; - } - - /** - * Traverse by reads, given the data and the walker - * - * @param walker the walker to traverse with - * @param dataProvider the provider of the reads data - * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function - * @return the reduce variable of the read walker - */ - public T traverse(ReadWalker walker, - ReadShardDataProvider dataProvider, - T sum) { - - logger.debug(String.format("TraverseReads.traverse Covered dataset is %s", dataProvider)); - - if( !dataProvider.hasReads() ) - throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); - - ReadView reads = new ReadView(dataProvider); - ReadReferenceView reference = new ReadReferenceView(dataProvider); - - // get the reference ordered data - ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); - - boolean done = walker.isDone(); - // while we still have more reads - for (SAMRecord read : reads) { - if ( done ) break; - // ReferenceContext -- the reference bases covered by the read - ReferenceContext refContext = null; - - // get the array of characters for the reference sequence, since we're a mapped read - if (!read.getReadUnmappedFlag() && dataProvider.hasReference()) - refContext = reference.getReferenceContext(read); - - // update the number of reads we've seen - ReadMetrics readMetrics = dataProvider.getShard().getReadMetrics(); - readMetrics.incrementNumIterations(); - - // if the read is mapped, create a metadata tracker - ReadMetaDataTracker tracker = (read.getReferenceIndex() >= 0) ? 
rodView.getReferenceOrderedDataForRead(read) : null; - - final boolean keepMeP = walker.filter(refContext, (GATKSAMRecord) read); - if (keepMeP) { - M x = walker.map(refContext, (GATKSAMRecord) read, tracker); // the tracker can be null - sum = walker.reduce(x, sum); - } - - GenomeLoc locus = read.getReferenceIndex() == SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX ? null : engine.getGenomeLocParser().createGenomeLoc(read.getReferenceName(),read.getAlignmentStart()); - printProgress(dataProvider.getShard(),locus); - done = walker.isDone(); - } - return sum; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java new file mode 100755 index 000000000..735f62ca3 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.traversals; + +import net.sf.samtools.SAMRecord; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.ReadBasedReferenceOrderedView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadReferenceView; +import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ReadView; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.nanoScheduler.NSMapFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NSReduceFunction; +import org.broadinstitute.sting.utils.nanoScheduler.NanoScheduler; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.LinkedList; +import java.util.List; + +/** + * A nano-scheduling version of TraverseReads. + * + * Implements the traversal of a walker that accepts individual reads, the reference, and + * RODs per map call. 
Directly supports shared memory parallelism via NanoScheduler + * + * @author depristo + * @version 1.0 + * @date 9/2/2012 + */ +public class TraverseReadsNano extends TraversalEngine,ReadShardDataProvider> { + /** our log, which we want to capture anything from this class */ + protected static final Logger logger = Logger.getLogger(TraverseReadsNano.class); + private static final boolean DEBUG = false; + final NanoScheduler nanoScheduler; + + public TraverseReadsNano(int nThreads) { + nanoScheduler = new NanoScheduler(nThreads); + } + + @Override + public String getTraversalUnits() { + return "reads"; + } + + /** + * Traverse by reads, given the data and the walker + * + * @param walker the walker to traverse with + * @param dataProvider the provider of the reads data + * @param sum the value of type T, specified by the walker, to feed to the walkers reduce function + * @return the reduce variable of the read walker + */ + public T traverse(ReadWalker walker, + ReadShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseReadsNano.traverse Covered dataset is %s", dataProvider)); + + if( !dataProvider.hasReads() ) + throw new IllegalArgumentException("Unable to traverse reads; no read data is available."); + + nanoScheduler.setDebug(DEBUG); + final TraverseReadsMap myMap = new TraverseReadsMap(walker); + final TraverseReadsReduce myReduce = new TraverseReadsReduce(walker); + + final List aggregatedInputs = aggregateMapData(dataProvider); + final T result = nanoScheduler.execute(aggregatedInputs.iterator(), myMap, sum, myReduce); + + final GATKSAMRecord lastRead = aggregatedInputs.get(aggregatedInputs.size() - 1).read; + final GenomeLoc locus = engine.getGenomeLocParser().createGenomeLoc(lastRead); + + updateCumulativeMetrics(dataProvider.getShard()); + printProgress(locus); + + return result; + } + + /** + * Aggregate all of the inputs for all map calls into MapData, to be provided + * to NanoScheduler for Map/Reduce + * + * @param 
dataProvider the source of our data + * @return a linked list of MapData objects holding the read, ref, and ROD info for every map/reduce + * should execute + */ + private List aggregateMapData(final ReadShardDataProvider dataProvider) { + final ReadView reads = new ReadView(dataProvider); + final ReadReferenceView reference = new ReadReferenceView(dataProvider); + final ReadBasedReferenceOrderedView rodView = new ReadBasedReferenceOrderedView(dataProvider); + + final List mapData = new LinkedList(); + for ( final SAMRecord read : reads ) { + final ReferenceContext refContext = ! read.getReadUnmappedFlag() + ? reference.getReferenceContext(read) + : null; + + // if the read is mapped, create a metadata tracker + final RefMetaDataTracker tracker = read.getReferenceIndex() >= 0 + ? rodView.getReferenceOrderedDataForRead(read) + : null; + + // update the number of reads we've seen + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + mapData.add(new MapData((GATKSAMRecord)read, refContext, tracker)); + } + + return mapData; + } + + @Override + public void shutdown() { + nanoScheduler.shutdown(); + } + + /** + * The input data needed for each map call. 
The read, the reference, and the RODs + */ + private class MapData { + final GATKSAMRecord read; + final ReferenceContext refContext; + final RefMetaDataTracker tracker; + + private MapData(GATKSAMRecord read, ReferenceContext refContext, RefMetaDataTracker tracker) { + this.read = read; + this.refContext = refContext; + this.tracker = tracker; + } + } + + /** + * Contains the results of a map call, indicating whether the call was good, filtered, or done + */ + private class MapResult { + final M value; + final boolean reduceMe; + + /** + * Create a MapResult with value that should be reduced + * + * @param value the value to reduce + */ + private MapResult(final M value) { + this.value = value; + this.reduceMe = true; + } + + /** + * Create a MapResult that shouldn't be reduced + */ + private MapResult() { + this.value = null; + this.reduceMe = false; + } + } + + /** + * A static object that tells reduce that the result of map should be skipped (filtered or done) + */ + private final MapResult SKIP_REDUCE = new MapResult(); + + /** + * MapFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Applies walker.map to MapData, returning a MapResult object containing the result + */ + private class TraverseReadsMap implements NSMapFunction { + final ReadWalker walker; + + private TraverseReadsMap(ReadWalker walker) { + this.walker = walker; + } + + @Override + public MapResult apply(final MapData data) { + if ( ! 
walker.isDone() ) { + final boolean keepMeP = walker.filter(data.refContext, data.read); + if (keepMeP) + return new MapResult(walker.map(data.refContext, data.read, data.tracker)); + } + + return SKIP_REDUCE; + } + } + + /** + * NSReduceFunction for TraverseReads meeting NanoScheduler interface requirements + * + * Takes a MapResult object and applies the walker's reduce function to it when its reduceMe flag is set; otherwise the running sum is returned unchanged + */ + private class TraverseReadsReduce implements NSReduceFunction { + final ReadWalker walker; + + private TraverseReadsReduce(ReadWalker walker) { + this.walker = walker; + } + + @Override + public T apply(MapResult one, T sum) { + if ( one.reduceMe ) + // only run reduce on map results flagged reduceMe (i.e. not the SKIP_REDUCE sentinel) + return walker.reduce(one.value, sum); + else + return sum; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java index cbe791353..fed2c995e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ActiveRegionWalker.java @@ -12,6 +12,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule; @@ -77,7 +78,7 @@ public abstract class ActiveRegionWalker extends Walker { +public class FlagStat extends ReadWalker implements NanoSchedulable { @Output PrintStream out; // what comes out of the flagstat - static class FlagStatus { + public final static class FlagStatus { long readCount = 0L; long 
QC_failure = 0L; long duplicates = 0L; @@ -117,62 +117,84 @@ public class FlagStat extends ReadWalker { return builder.toString(); } - } + public FlagStatus add(final FlagStatus other) { + readCount += other.readCount; + QC_failure += other.QC_failure; + duplicates += other.duplicates; + mapped += other.mapped; + paired_in_sequencing += other.paired_in_sequencing; + read1 += other.read1; + read2 += other.read2; + properly_paired += other.properly_paired; + with_itself_and_mate_mapped += other.with_itself_and_mate_mapped; + singletons += other.singletons; + with_mate_mapped_to_a_different_chr += other.with_mate_mapped_to_a_different_chr; + with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5 += other.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5; - - private FlagStatus myStat = new FlagStatus(); - - public Integer map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { - myStat.readCount++; - if (read.getReadFailsVendorQualityCheckFlag()) { - myStat.QC_failure++; + return this; } - if (read.getDuplicateReadFlag()) { - myStat.duplicates++; - } - if (!read.getReadUnmappedFlag()) { - myStat.mapped++; - } - if (read.getReadPairedFlag()) { - myStat.paired_in_sequencing++; - if (read.getSecondOfPairFlag()) { - myStat.read2++; - } else if (read.getReadPairedFlag()) { - myStat.read1++; + public FlagStatus add(final GATKSAMRecord read) { + this.readCount++; + + if (read.getReadFailsVendorQualityCheckFlag()) { + this.QC_failure++; } - if (read.getProperPairFlag()) { - myStat.properly_paired++; + if (read.getDuplicateReadFlag()) { + this.duplicates++; } - if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) { - myStat.with_itself_and_mate_mapped++; + if (!read.getReadUnmappedFlag()) { + this.mapped++; + } + if (read.getReadPairedFlag()) { + this.paired_in_sequencing++; - if (!read.getReferenceIndex().equals(read.getMateReferenceIndex())) { - myStat.with_mate_mapped_to_a_different_chr++; + if 
(read.getSecondOfPairFlag()) { + this.read2++; + } else if (read.getReadPairedFlag()) { + this.read1++; + } + if (read.getProperPairFlag()) { + this.properly_paired++; + } + if (!read.getReadUnmappedFlag() && !read.getMateUnmappedFlag()) { + this.with_itself_and_mate_mapped++; - if (read.getMappingQuality() >= 5) { - myStat.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++; + if (!read.getReferenceIndex().equals(read.getMateReferenceIndex())) { + this.with_mate_mapped_to_a_different_chr++; + + if (read.getMappingQuality() >= 5) { + this.with_mate_mapped_to_a_different_chr_maq_greaterequal_than_5++; + } } } + if (!read.getReadUnmappedFlag() && read.getMateUnmappedFlag()) { + this.singletons++; + } } - if (!read.getReadUnmappedFlag() && read.getMateUnmappedFlag()) { - myStat.singletons++; - } + + return this; } - return 1; - } - public Integer reduceInit() { - return 0; + + @Override + public FlagStatus map( final ReferenceContext ref, final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) { + return new FlagStatus().add(read); + } + + @Override + public FlagStatus reduceInit() { + return new FlagStatus(); } - public Integer reduce(Integer value, Integer sum) { - return value + sum; + @Override + public FlagStatus reduce(final FlagStatus value, final FlagStatus sum) { + return sum.add(value); } - public void onTraversalDone(Integer result) { - //out.println("[REDUCE RESULT] Traversal result is: " + result); - out.println(myStat.toString()); + @Override + public void onTraversalDone(final FlagStatus result) { + out.println(result.toString()); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java new file mode 100755 index 000000000..731ce7e4e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/NanoSchedulable.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2010. 
The Broad Institute + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.gatk.walkers; + +/** + * Root parallelism interface. Walkers that implement this + * declare that their map function is thread-safe and so multiple + * map calls can be run in parallel in the same JVM instance. + */ +public interface NanoSchedulable { +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java index 2a6ecdb8c..a3efea9f1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/Pileup.java @@ -45,25 +45,14 @@ import java.util.Collections; import java.util.List; /** - * Prints the alignment in the pileup format. 
In the pileup format, each line represents a genomic position, - * consisting of chromosome name, coordinate, reference base, read bases, read qualities and alignment mapping - * qualities. Information on match, mismatch, indel, strand, mapping quality and start and end of a read are all - * encoded at the read base column. At this column, a dot stands for a match to the reference base on the forward strand, - * a comma for a match on the reverse strand, 'ACGTN' for a mismatch on the forward strand and 'acgtn' for a mismatch on the - * reverse strand. - * - * A pattern '\+[0-9]+[ACGTNacgtn]+' indicates there is an insertion between this reference position and the next - * reference position. The length of the insertion is given by the integer in the pattern, followed by the inserted sequence. - * Similarly, a pattern '-[0-9]+[ACGTNacgtn]+' represents a deletion from the reference. - * Also at the read base column, a symbol '^' marks the start of a read segment which is a contiguous subsequence on the read - * separated by 'N/S/H' CIGAR operations. The ASCII of the character following '^' minus 33 gives the mapping quality. - * A symbol '$' marks the end of a read segment. + * Prints the alignment in something similar to the samtools pileup format. Each line represents a genomic position, + * consisting of chromosome name, coordinate, reference base, read bases, and read qualities. 
* * Associated command: * samtools pileup [-f in.ref.fasta] [-t in.ref_list] [-l in.site_list] [-iscg] [-T theta] [-N nHap] [-r pairDiffRate] */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class Pileup extends LocusWalker implements TreeReducible { +public class Pileup extends LocusWalker implements TreeReducible, NanoSchedulable { private static final String verboseDelimiter = "@"; // it's ugly to use "@" but it's literally the only usable character not allowed in read names @@ -81,27 +70,32 @@ public class Pileup extends LocusWalker implements TreeReducib @Input(fullName="metadata",shortName="metadata",doc="Add these ROD bindings to the output Pileup", required=false) public List> rods = Collections.emptyList(); - public void initialize() { - } - - public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - - String rods = getReferenceOrderedData( tracker ); + @Override + public String map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + final String rods = getReferenceOrderedData( tracker ); ReadBackedPileup basePileup = context.getBasePileup(); - out.printf("%s %s", basePileup.getPileupString((char)ref.getBase()), rods); - if ( SHOW_VERBOSE ) - out.printf(" %s", createVerboseOutput(basePileup)); - out.println(); - return 1; + final StringBuilder s = new StringBuilder(); + s.append(String.format("%s %s", basePileup.getPileupString((char)ref.getBase()), rods)); + if ( SHOW_VERBOSE ) + s.append(" ").append(createVerboseOutput(basePileup)); + s.append("\n"); + + return s.toString(); } // Given result of map function + @Override public Integer reduceInit() { return 0; } - public Integer reduce(Integer value, Integer sum) { - return treeReduce(sum,value); + + @Override + public Integer reduce(String value, Integer sum) { + out.print(value); + return sum + 1; } + + @Override public Integer treeReduce(Integer lhs, 
Integer rhs) { return lhs + rhs; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java index 8257794d7..37176cbf9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/PrintReads.java @@ -32,17 +32,16 @@ import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; -import java.util.Collection; -import java.util.Random; -import java.util.Set; -import java.util.TreeSet; +import java.util.*; /** * Renders, in SAM/BAM format, all reads from the input data set in the order in which they appear in the input file. 
@@ -91,9 +90,10 @@ import java.util.TreeSet; * */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) +@ReadTransformersMode(ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.HANDLED_IN_WALKER) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class PrintReads extends ReadWalker { +public class PrintReads extends ReadWalker implements NanoSchedulable { @Output(doc="Write output to this BAM filename instead of STDOUT", required = true) SAMFileWriter out; @@ -138,6 +138,7 @@ public class PrintReads extends ReadWalker { public boolean simplifyReads = false; + List readTransformers = Collections.emptyList(); private TreeSet samplesToChoose = new TreeSet(); private boolean SAMPLES_SPECIFIED = false; @@ -150,6 +151,9 @@ public class PrintReads extends ReadWalker { if ( platform != null ) platform = platform.toUpperCase(); + if ( getToolkit() != null ) + readTransformers = getToolkit().getReadTransformers(); + Collection samplesFromFile; if (!sampleFile.isEmpty()) { samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFile); @@ -217,11 +221,19 @@ public class PrintReads extends ReadWalker { * The reads map function. * * @param ref the reference bases that correspond to our read, if a reference was provided - * @param read the read itself, as a GATKSAMRecord + * @param readIn the read itself, as a GATKSAMRecord * @return the read itself */ - public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker ) { - return simplifyReads ? 
read.simplify() : read; + public GATKSAMRecord map( ReferenceContext ref, GATKSAMRecord readIn, RefMetaDataTracker metaDataTracker ) { + GATKSAMRecord workingRead = readIn; + + for ( final ReadTransformer transformer : readTransformers ) { + workingRead = transformer.apply(workingRead); + } + + if ( simplifyReads ) workingRead = workingRead.simplify(); + + return workingRead; } /** @@ -245,5 +257,4 @@ public class PrintReads extends ReadWalker { output.addAlignment(read); return output; } - } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java index 77e3af93f..42fbb32bd 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/ReadWalker.java @@ -1,8 +1,7 @@ package org.broadinstitute.sting.gatk.walkers; -import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /** @@ -27,5 +26,5 @@ public abstract class ReadWalker extends Walker { final protected static Logger logger = Logger.getLogger(Walker.class); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java index 30f81b20c..18bdb02ed 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalance.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -51,7 +52,12 @@ public class AlleleBalance extends InfoFieldAnnotation { char[] BASES = {'A','C','G','T'}; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java index 11c9c3a99..4d79c4112 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AlleleBalanceBySample.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -24,7 +25,17 @@ import java.util.List; */ public class AlleleBalanceBySample 
extends GenotypeAnnotation implements ExperimentalAnnotation { - public void annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, final GenotypeBuilder gb) { + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ + if ( stratifiedContext == null ) + return; + Double ratio = annotateSNP(stratifiedContext, vc, g); if (ratio == null) return; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java index c3b6de65a..aef3e49cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseCounts.java @@ -36,6 +36,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -52,7 +53,12 @@ import java.util.Map; */ public class BaseCounts extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final 
ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java index bd884892c..e59fc827d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/BaseQualityRankSumTest.java @@ -1,12 +1,11 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import java.util.*; @@ -14,73 +13,44 @@ import java.util.*; /** * The u-based z-approximation from the Mann-Whitney Rank Sum Test for base qualities (ref bases vs. bases of the alternate allele). - * Note that the base quality rank sum test can not be calculated for homozygous sites. + * Note that the base quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. 
*/ public class BaseQualityRankSumTest extends RankSumTest implements StandardAnnotation { public List getKeyNames() { return Arrays.asList("BaseQRankSum"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("BaseQRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - for ( final PileupElement p : pileup ) { - if( isUsableBase(p) ) { - if ( p.getBase() == ref ) - refQuals.add((double)p.getQual()); - else if ( alts.contains(p.getBase()) ) - altQuals.add((double)p.getQual()); - } - } - } - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - // TODO -- implement me; how do we pull out the correct offset from the read? - return; - -/* - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; - - for ( final GATKSAMRecord read : alleleBin.getValue() ) { + protected void fillQualsFromPileup(final List allAlleles, final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap, + final List refQuals, final List altQuals){ + if (alleleLikelihoodMap == null) { + // use fast SNP-based version if we don't have per-read allele likelihoods + for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { - if ( matchesRef ) + if ( allAlleles.get(0).equals(Allele.create(p.getBase(),true)) ) { refQuals.add((double)p.getQual()); - else + } else if ( allAlleles.contains(Allele.create(p.getBase()))) { altQuals.add((double)p.getQual()); - } - } - } -*/ - } - - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) 
{ - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? - HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p: pileup) { - if (indelLikelihoodMap.containsKey(p)) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Map.Entry entry : el.entrySet()) { - - if (entry.getKey().isReference()) - refLikelihood = entry.getValue(); - else { - double like = entry.getValue(); - if (like >= altLikelihood) - altLikelihood = like; } } - if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) - refQuals.add(-10.0*refLikelihood); - else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) - altQuals.add(-10.0*altLikelihood); } + return; + } + + for (Map el : alleleLikelihoodMap.getLikelihoodMapValues()) { + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + refQuals.add(-10.0*(double)el.get(a)); + else if (allAlleles.contains(a)) + altQuals.add(-10.0*(double)el.get(a)); + + } } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java index 54837baad..0c78c0204 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ChromosomeCounts.java @@ -34,12 +34,11 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; @@ -61,7 +60,12 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn private Set founderIds = new HashSet(); - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { if ( ! vc.hasGenotypes() ) return null; @@ -73,13 +77,6 @@ public class ChromosomeCounts extends InfoFieldAnnotation implements StandardAnn founderIds = ((Walker)walker).getSampleDB().getFounderIds(); } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( ! 
vc.hasGenotypes() ) - return null; - - return VariantContextUtils.calculateChromosomeCounts(vc, new HashMap(), true); - } - public List getKeyNames() { return Arrays.asList(keyNames); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java index f41a40621..1dff4d1a3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ClippingRankSumTest.java @@ -1,10 +1,8 @@ package org.broadinstitute.sting.gatk.walkers.annotator; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -18,74 +16,36 @@ import java.util.*; * Date: 6/28/12 */ +/** + * The u-based z-approximation from the Mann-Whitney Rank Sum Test for reads with clipped bases (reads with ref bases vs. those with the alternate allele) + * Note that the clipping rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. + */ public class ClippingRankSumTest extends RankSumTest { public List getKeyNames() { return Arrays.asList("ClippingRankSum"); } public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("ClippingRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. 
Ref number of hard clipped bases")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - return; - // This working implementation below needs to be tested for the UG pipeline - /* - for ( final PileupElement p : pileup ) { - if ( isUsableBase(p) ) { - if ( p.getBase() == ref ) { - refQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - } else if ( alts.contains(p.getBase()) ) { - altQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - } - } - } - */ - } - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; + protected void fillQualsFromPileup(final List allAlleles, + final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap likelihoodMap, final List refQuals, final List altQuals) { + // todo - only support non-pileup case for now, e.g. 
active-region based version + if (pileup != null || likelihoodMap == null) + return; + + for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { + + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + refQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); + else if (allAlleles.contains(a)) + altQuals.add((double)AlignmentUtils.getNumHardClippedBases(el.getKey())); - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - if ( matchesRef ) - refQuals.add((double)AlignmentUtils.getNumHardClippedBases(read)); - else - altQuals.add((double)AlignmentUtils.getNumHardClippedBases(read)); - } } } - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { - return; - // This working implementation below needs to be tested for the UG pipeline - - /* - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? 
- HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p: pileup) { - if (indelLikelihoodMap.containsKey(p) && p.getMappingQual() != 0 && p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Allele a : el.keySet()) { - - if (a.isReference()) - refLikelihood =el.get(a); - else { - double like = el.get(a); - if (like >= altLikelihood) - altLikelihood = like; - } - } - if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) - refQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) - altQuals.add((double)AlignmentUtils.getNumHardClippedBases(p.getRead())); - } - } - */ - } -} + } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java index 39b5e84dc..c9481f244 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthOfCoverage.java @@ -7,10 +7,12 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import 
org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -29,28 +31,34 @@ import java.util.Map; */ public class DepthOfCoverage extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { int depth = 0; - for ( Map.Entry sample : stratifiedContexts.entrySet() ) - depth += sample.getValue().getBasePileup().depthOfCoverage(); - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%d", depth)); - return map; - } + if (stratifiedContexts != null) { + if ( stratifiedContexts.size() == 0 ) + return null; - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; + for ( Map.Entry sample : stratifiedContexts.entrySet() ) + depth += sample.getValue().getBasePileup().depthOfCoverage(); + } + else if (perReadAlleleLikelihoodMap != null) { + if ( perReadAlleleLikelihoodMap.size() == 0 ) + return null; - int depth = 0; - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final List alleleBin : alleleBins.values() ) { - depth += alleleBin.size(); + for (PerReadAlleleLikelihoodMap maps : perReadAlleleLikelihoodMap.values() ) { + for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = el.getKey(); + depth += (read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); + } } } + else + return null; Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%d", depth)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java index 61e30f3b9..89a239e54 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/DepthPerAlleleBySample.java @@ -6,11 +6,14 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; @@ -19,6 +22,7 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; import java.util.HashMap; import java.util.List; +import java.util.Map; /** @@ -34,25 +38,33 @@ import java.util.List; * the reads. 
If, for example, I believe there really is a an A/T polymorphism at a site, then I would like * to know the counts of A and T bases in this sample, even for reads with poor mapping quality that would * normally be excluded from the statistical calculations going into GQ and QUAL. Please note, however, that - * the AD isn't necessarily calculated exactly for indels (it counts as non-reference only those indels that - * are unambiguously informative about the alternate allele). Because of this fact and - * because the AD includes reads and bases that were filtered by the Unified Genotyper, one should not base - * assumptions about the underlying genotype based on it; instead, the genotype likelihoods (PLs) are what - * determine the genotype calls. + * the AD isn't necessarily calculated exactly for indels. Only reads which are statistically favoring one allele over the other are counted. + * Because of this fact, the sum of AD may be different than the individual sample depth, especially when there are + * many non-informative reads. + * Because the AD includes reads and bases that were filtered by the Unified Genotyper and in case of indels is based on a statistical computation, + * one should not base assumptions about the underlying genotype based on it; + * instead, the genotype likelihoods (PLs) are what determine the genotype calls. 
*/ public class DepthPerAlleleBySample extends GenotypeAnnotation implements StandardAnnotation { - public void annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, AlignmentContext stratifiedContext, VariantContext vc, Genotype g, GenotypeBuilder gb) { - if ( g == null || !g.isCalled() ) + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap) { + if ( g == null || !g.isCalled() || ( stratifiedContext == null && alleleLikelihoodMap == null) ) return; - if ( vc.isSNP() ) - annotateSNP(stratifiedContext, vc, gb); - else if ( vc.isIndel() ) - annotateIndel(stratifiedContext, ref.getBase(), vc, gb); + if (alleleLikelihoodMap != null && !alleleLikelihoodMap.isEmpty()) + annotateWithLikelihoods(alleleLikelihoodMap, vc, gb); + else if ( stratifiedContext != null && (vc.isSNP())) + annotateWithPileup(stratifiedContext, vc, gb); } - private void annotateSNP(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { + private void annotateWithPileup(final AlignmentContext stratifiedContext, final VariantContext vc, final GenotypeBuilder gb) { HashMap alleleCounts = new HashMap(); for ( Allele allele : vc.getAlleles() ) @@ -61,7 +73,7 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa ReadBackedPileup pileup = stratifiedContext.getBasePileup(); for ( PileupElement p : pileup ) { if ( alleleCounts.containsKey(p.getBase()) ) - alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+1); + alleleCounts.put(p.getBase(), alleleCounts.get(p.getBase())+p.getRepresentativeCount()); } // we need to add counts in the correct order @@ -73,52 +85,29 @@ public class DepthPerAlleleBySample extends GenotypeAnnotation implements Standa gb.AD(counts); 
} - private void annotateIndel(final AlignmentContext stratifiedContext, final byte refBase, final VariantContext vc, final GenotypeBuilder gb) { - ReadBackedPileup pileup = stratifiedContext.getBasePileup(); - if ( pileup == null ) - return; - + private void annotateWithLikelihoods(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final VariantContext vc, final GenotypeBuilder gb) { final HashMap alleleCounts = new HashMap(); - final Allele refAllele = vc.getReference(); for ( final Allele allele : vc.getAlleles() ) { alleleCounts.put(allele, 0); } - - for ( PileupElement p : pileup ) { - if ( p.isBeforeInsertion() ) { - - final String eventBases = p.getEventBases(); - if ( eventBases == null ) - continue; - - final Allele insertion = Allele.create((char)refBase + eventBases, false); - if ( alleleCounts.containsKey(insertion) ) { - alleleCounts.put(insertion, alleleCounts.get(insertion)+1); - } - - } else if ( p.isBeforeDeletionStart() ) { - if ( p.getEventLength() == refAllele.length() - 1 ) { - // this is indeed the deletion allele recorded in VC - final Allele deletion = Allele.create(refBase); - if ( alleleCounts.containsKey(deletion) ) { - alleleCounts.put(deletion, alleleCounts.get(deletion)+1); - } - } - } else if ( p.getRead().getAlignmentEnd() > vc.getStart() ) { - alleleCounts.put(refAllele, alleleCounts.get(refAllele)+1); - } + for (Map.Entry> el : perReadAlleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = el.getKey(); + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (!vc.getAlleles().contains(a)) + continue; // sanity check - shouldn't be needed + alleleCounts.put(a, alleleCounts.get(a) + (read.isReducedRead() ? 
read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1)); } - final int[] counts = new int[alleleCounts.size()]; - counts[0] = alleleCounts.get(refAllele); + counts[0] = alleleCounts.get(vc.getReference()); for (int i = 0; i < vc.getAlternateAlleles().size(); i++) counts[i+1] = alleleCounts.get( vc.getAlternateAllele(i) ); gb.AD(counts); } - // public String getIndelBases() public List getKeyNames() { return Arrays.asList(VCFConstants.GENOTYPE_ALLELE_DEPTHS); } public List getDescriptions() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java index 131670599..bdf7baec9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/FisherStrand.java @@ -32,13 +32,13 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import 
org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -54,46 +54,64 @@ import java.util.*; public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private static final String FS = "FS"; private static final double MIN_PVALUE = 1E-320; + private static final int MIN_QUAL_FOR_FILTERED_TEST = 17; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( !vc.isVariant() ) return null; - int[][] table; - - if ( vc.isSNP() ) - table = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); - else if ( vc.isIndel() || vc.isMixed() ) { - table = getIndelContingencyTable(stratifiedContexts); - if (table == null) - return null; + if (vc.isSNP() && stratifiedContexts != null) { + final int[][] tableNoFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), -1); + final int[][] tableFiltering = getSNPContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount(), MIN_QUAL_FOR_FILTERED_TEST); + return pValueForBestTable(tableFiltering, tableNoFiltering); + } + else if (stratifiedPerReadAlleleLikelihoodMap != null) { + // either SNP with no alignment context, or indels: per-read likelihood map needed + final int[][] table = getContingencyTable(stratifiedPerReadAlleleLikelihoodMap, vc); + return pValueForBestTable(table, null); } else + // for non-snp variants, we need per-read likelihoods. 
+ // for snps, we can get same result from simple pileup return null; - - Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); - if ( pvalue == null ) - return null; - - Map map = new HashMap(); - map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); - return map; } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( !vc.isVariant() ) - return null; - - final int[][] table = getContingencyTable(stratifiedContexts, vc.getReference(), vc.getAltAlleleWithHighestAlleleCount()); - - final Double pvalue = Math.max(pValueForContingencyTable(table), MIN_PVALUE); - if ( pvalue == null ) - return null; - - final Map map = new HashMap(); - map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pvalue))); - return map; + /** + * Create an annotation for the highest (i.e., least significant) p-value of table1 and table2 + * + * @param table1 a contingency table, may be null + * @param table2 a contingency table, may be null + * @return annotation result for FS given tables + */ + private Map pValueForBestTable(final int[][] table1, final int[][] table2) { + if ( table2 == null ) + return table1 == null ? 
null : annotationForOneTable(pValueForContingencyTable(table1)); + else if (table1 == null) + return annotationForOneTable(pValueForContingencyTable(table2)); + else { // take the one with the best (i.e., least significant pvalue) + double pvalue1 = Math.max(pValueForContingencyTable(table1), MIN_PVALUE); + double pvalue2 = Math.max(pValueForContingencyTable(table2), MIN_PVALUE); + return annotationForOneTable(Math.max(pvalue1, pvalue2)); + } + } + /** + * Returns an annotation result given a pValue + * + * @param pValue + * @return a hash map from FS -> phred-scaled pValue + */ + private Map annotationForOneTable(final double pValue) { + final Object value = String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue)); + return Collections.singletonMap(FS, value); +// Map map = new HashMap(); +// map.put(FS, String.format("%.3f", QualityUtils.phredScaleErrorRate(pValue))); +// return map; } public List getKeyNames() { @@ -161,7 +179,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat table[0][1] += 1; table[1][1] -= 1; - return (table[0][0] >= 0 && table[1][1] >= 0) ? true : false; + return (table[0][0] >= 0 && table[1][1] >= 0); } private static boolean unrotateTable(int[][] table) { @@ -171,7 +189,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat table[0][1] -= 1; table[1][1] += 1; - return (table[0][1] >= 0 && table[1][0] >= 0) ? 
true : false; + return (table[0][1] >= 0 && table[1][0] >= 0); } private static double computePValue(int[][] table) { @@ -218,31 +236,31 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * allele2 # # * @return a 2x2 contingency table */ - private static int[][] getContingencyTable(Map>> stratifiedContexts, Allele ref, Allele alt) { + private static int[][] getContingencyTable( final Map stratifiedPerReadAlleleLikelihoodMap, final VariantContext vc) { + final Allele ref = vc.getReference(); + final Allele alt = vc.getAltAlleleWithHighestAlleleCount(); int[][] table = new int[2][2]; - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { + for (PerReadAlleleLikelihoodMap maps : stratifiedPerReadAlleleLikelihoodMap.values() ) { + for (Map.Entry> el : maps.getLikelihoodReadMap().entrySet()) { + final boolean matchesRef = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(ref,true); + final boolean matchesAlt = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()).equals(alt,true); - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alt.equals(alleleBin.getKey()); if ( !matchesRef && !matchesAlt ) continue; - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - boolean isFW = read.getReadNegativeStrandFlag(); + boolean isFW = el.getKey().getReadNegativeStrandFlag(); - int row = matchesRef ? 0 : 1; - int column = isFW ? 0 : 1; + int row = matchesRef ? 0 : 1; + int column = isFW ? 0 : 1; - table[row][column]++; - } + final GATKSAMRecord read = el.getKey(); + table[row][column] += (read.isReducedRead() ? read.getReducedCount(ReadUtils.getReadCoordinateForReferenceCoordinate(read, vc.getStart(), ReadUtils.ClippingTail.RIGHT_TAIL)) : 1); } } return table; } - /** Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: * fw rc @@ -250,16 +268,22 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat * allele2 # # * @return a 2x2 contingency table */ - private static int[][] getSNPContingencyTable(Map stratifiedContexts, Allele ref, Allele alt) { + private static int[][] getSNPContingencyTable(final Map stratifiedContexts, + final Allele ref, + final Allele alt, + final int minQScoreToConsider ) { int[][] table = new int[2][2]; for ( Map.Entry sample : stratifiedContexts.entrySet() ) { for (PileupElement p : sample.getValue().getBasePileup()) { - if ( ! RankSumTest.isUsableBase(p, false) || p.getRead().isReducedRead() ) // ignore deletions and reduced reads + if ( ! RankSumTest.isUsableBase(p, false) ) // ignore deletions continue; - Allele base = Allele.create(p.getBase(), false); - boolean isFW = !p.getRead().getReadNegativeStrandFlag(); + if ( p.getQual() < minQScoreToConsider || p.getMappingQual() < minQScoreToConsider ) + continue; + + final Allele base = Allele.create(p.getBase(), false); + final boolean isFW = !p.getRead().getReadNegativeStrandFlag(); final boolean matchesRef = ref.equals(base, true); final boolean matchesAlt = alt.equals(base, true); @@ -267,73 +291,7 @@ public class FisherStrand extends InfoFieldAnnotation implements StandardAnnotat int row = matchesRef ? 0 : 1; int column = isFW ? 0 : 1; - table[row][column]++; - } - } - } - - return table; - } - - /** - Allocate and fill a 2x2 strand contingency table. 
In the end, it'll look something like this: - * fw rc - * allele1 # # - * allele2 # # - * @return a 2x2 contingency table - */ - private static int[][] getIndelContingencyTable(Map stratifiedContexts) { - final double INDEL_LIKELIHOOD_THRESH = 0.3; - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - - if (indelLikelihoodMap == null) - return null; - - int[][] table = new int[2][2]; - - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - final AlignmentContext context = sample.getValue(); - if ( context == null ) - continue; - - final ReadBackedPileup pileup = context.getBasePileup(); - for ( final PileupElement p : pileup ) { - if ( ! RankSumTest.isUsableBase(p, true) || p.getRead().isReducedRead() ) // ignore reduced reads - continue; - if ( indelLikelihoodMap.containsKey(p) ) { - // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. - // A pileup element then has a list of pairs of form (Allele, likelihood of this allele). - // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. 
- // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pileup element is "ref" - // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element it "alt" - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - boolean isFW = !p.getRead().getReadNegativeStrandFlag(); - - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Map.Entry entry : el.entrySet()) { - - if (entry.getKey().isReference()) - refLikelihood = entry.getValue(); - else { - double like = entry.getValue(); - if (like >= altLikelihood) - altLikelihood = like; - } - } - - boolean matchesRef = (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)); - boolean matchesAlt = (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)); - if ( matchesRef || matchesAlt ) { - int row = matchesRef ? 0 : 1; - int column = isFW ? 
0 : 1; - - table[row][column]++; - } - - + table[row][column] += p.getRepresentativeCount(); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index fba30b3f7..07391c78c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -25,7 +26,12 @@ import java.util.Map; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) public class GCContent extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { double content = computeGCContent(ref); Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%.2f", content)); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index c6d8883c5..ca7180510 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -28,10 +28,11 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; @@ -47,6 +48,7 @@ import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.Serializable; import java.util.*; /** @@ -54,17 +56,31 @@ import java.util.*; * are indicative of regions with bad alignments, often leading to artifactual SNP and indel calls. * Note that the Haplotype Score is only calculated for sites with read coverage. 
*/ -public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation { +public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { private final static boolean DEBUG = false; private final static int MIN_CONTEXT_WING_SIZE = 10; private final static int MAX_CONSENSUS_HAPLOTYPES_TO_CONSIDER = 50; private final static char REGEXP_WILDCARD = '.'; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + if (vc.isSNP() && stratifiedContexts != null) + return annotatePileup(ref, stratifiedContexts, vc); + else if (stratifiedPerReadAlleleLikelihoodMap != null && vc.isVariant()) + return annotateWithLikelihoods(stratifiedPerReadAlleleLikelihoodMap, vc); + else return null; + } - if (!vc.isSNP() && !vc.isIndel() && !vc.isMixed()) + private Map annotatePileup(final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { + + if (stratifiedContexts.size() == 0) // size 0 means that call was made by someone else and we have no data here return null; final AlignmentContext context = AlignmentContextUtils.joinContexts(stratifiedContexts.values()); @@ -85,14 +101,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot final AlignmentContext thisContext = stratifiedContexts.get(genotype.getSampleName()); if (thisContext != null) { final ReadBackedPileup thisPileup = thisContext.getBasePileup(); - if (vc.isSNP()) - scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple 
average of all sample's score since the score can be negative and the RMS doesn't make sense - else if (vc.isIndel() || vc.isMixed()) { - Double d = scoreIndelsAgainstHaplotypes(thisPileup); - if (d == null) - return null; - scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense - } + scoreRA.add(scoreReadsAgainstHaplotypes(haplotypes, thisPileup, contextSize, locus)); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense } } } @@ -103,7 +112,32 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot return map; } - private static class HaplotypeComparator implements Comparator { + private Map annotateWithLikelihoods(final Map stratifiedPerReadAlleleLikelihoodMap, + final VariantContext vc) { + + final MathUtils.RunningAverage scoreRA = new MathUtils.RunningAverage(); + for (final Genotype genotype : vc.getGenotypes()) { + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); + if (perReadAlleleLikelihoodMap == null) + continue; + + Double d = scoreIndelsAgainstHaplotypes(perReadAlleleLikelihoodMap); + if (d == null) + continue; + scoreRA.add(d); // Taking the simple average of all sample's score since the score can be negative and the RMS doesn't make sense + } + + // if (scoreRA.observationCount() == 0) + // return null; + + // annotate the score in the info field + final Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%.4f", scoreRA.mean())); + return map; + + } + + private static class HaplotypeComparator implements Comparator, Serializable { public int compare(Haplotype a, Haplotype b) { if (a.getQualitySum() < b.getQualitySum()) @@ -177,7 +211,6 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot private Haplotype getHaplotypeFromRead(final PileupElement p, final int 
contextSize, final int locus) { final GATKSAMRecord read = p.getRead(); - int readOffsetFromPileup = p.getOffset(); final byte[] haplotypeBases = new byte[contextSize]; Arrays.fill(haplotypeBases, (byte) REGEXP_WILDCARD); @@ -189,7 +222,7 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot byte[] readQuals = read.getBaseQualities(); readQuals = AlignmentUtils.readToAlignmentByteArray(read.getCigar(), readQuals); // Shift the location of the qual scores based on the Cigar string - readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus); + final int readOffsetFromPileup = AlignmentUtils.calcAlignmentByteArrayOffset(read.getCigar(), p, read.getAlignmentStart(), locus); final int baseOffsetStart = readOffsetFromPileup - (contextSize - 1) / 2; for (int i = 0; i < contextSize; i++) { @@ -346,31 +379,26 @@ public class HaplotypeScore extends InfoFieldAnnotation implements StandardAnnot } - private Double scoreIndelsAgainstHaplotypes(final ReadBackedPileup pileup) { + private Double scoreIndelsAgainstHaplotypes(final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) { final ArrayList haplotypeScores = new ArrayList(); - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - - if (indelLikelihoodMap == null) + if (perReadAlleleLikelihoodMap.isEmpty()) return null; - for (final PileupElement p : pileup) { - if (indelLikelihoodMap.containsKey(p)) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); + for (Map el : perReadAlleleLikelihoodMap.getLikelihoodMapValues()) { - // Score all the reads in the pileup, even the filtered ones - final double[] scores = new double[el.size()]; - int i = 0; - for (Map.Entry a : el.entrySet()) { - scores[i++] = -a.getValue(); - if (DEBUG) { - System.out.printf(" vs. 
haplotype %d = %f%n", i - 1, scores[i - 1]); - } + // retrieve likelihood information corresponding to this read + // Score all the reads in the pileup, even the filtered ones + final double[] scores = new double[el.size()]; + int i = 0; + for (Map.Entry a : el.entrySet()) { + scores[i++] = -a.getValue(); + if (DEBUG) { + System.out.printf(" vs. haplotype %d = %f%n", i - 1, scores[i - 1]); } - - haplotypeScores.add(scores); } + + haplotypeScores.add(scores); } // indel likelihoods are strict log-probs, not phred scored diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java index 6ba85de07..0340f457c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HardyWeinberg.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.WorkInProgressAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -29,7 +30,12 @@ public class HardyWeinberg extends InfoFieldAnnotation implements WorkInProgress private static final int MIN_GENOTYPE_QUALITY = 10; private static final int MIN_LOG10_PERROR = MIN_GENOTYPE_QUALITY / 10; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final 
ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { final GenotypesContext genotypes = vc.getGenotypes(); if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java index 9f20bf375..037b357ae 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HomopolymerRun.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -22,7 +23,12 @@ public class HomopolymerRun extends InfoFieldAnnotation { private boolean ANNOTATE_INDELS = true; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( !vc.isBiallelic() ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java index 715895526..dd058b469 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/InbreedingCoeff.java @@ -8,11 +8,10 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -33,20 +32,21 @@ public class InbreedingCoeff extends InfoFieldAnnotation implements StandardAnno private static final int MIN_SAMPLES = 10; private Set founderIds; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { //If available, get the founder IDs and cache them. the IC will only be computed on founders then. 
- if(founderIds == null) + if(founderIds == null && walker != null) founderIds = ((Walker)walker).getSampleDB().getFounderIds(); return calculateIC(vc); } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - return calculateIC(vc); - } - private Map calculateIC(final VariantContext vc) { final GenotypesContext genotypes = (founderIds == null || founderIds.isEmpty()) ? vc.getGenotypes() : vc.getGenotypes(founderIds); - if ( genotypes == null || genotypes.size() < MIN_SAMPLES ) + if ( genotypes == null || genotypes.size() < MIN_SAMPLES || !vc.isVariant()) return null; int idxAA = 0, idxAB = 1, idxBB = 2; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java index babaf7ee6..c67d829c2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/IndelType.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.IndelUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -18,9 +19,14 @@ import java.util.*; */ public class IndelType extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final 
ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { - int run; + int run; if (vc.isMixed()) { Map map = new HashMap(); map.put(getKeyNames().get(0), String.format("%s", "MIXED")); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java index 7f5033adf..c9a4d0ee6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/LowMQ.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -21,7 +22,12 @@ import java.util.Map; */ public class LowMQ extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java index b6f24433e..c9d5ca261 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java @@ -3,13 +3,13 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.samples.Sample; -import org.broadinstitute.sting.gatk.samples.SampleDB; +import org.broadinstitute.sting.gatk.samples.Trio; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -19,64 +19,67 @@ import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.*; /** - * Created by IntelliJ IDEA. - * User: chartl - * Date: 9/14/11 - * Time: 12:24 PM + * Given a variant context, uses the genotype likelihoods to assess the likelihood of the site being a mendelian violation + * versus the likelihood of the site transmitting according to mendelian rules. This assumes that the organism is + * diploid. When multiple trios are present, the annotation is simply the maximum of the likelihood ratios, rather than + * the strict 1-Prod(1-p_i) calculation, as this can scale poorly for uncertain sites and many trios. 
*/ public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation, RodRequiringAnnotation { private MendelianViolation mendelianViolation = null; - private String motherId; - private String fatherId; - private String childId; + public static final String MVLR_KEY = "MVLR"; + private Set trios; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( mendelianViolation == null ) { - if (checkAndSetSamples(((Walker) walker).getSampleDB())) { + trios = ((Walker) walker).getSampleDB().getTrios(); + if ( trios.size() > 0 ) { mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).minGenotypeQualityP ); } else { - throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line containing only 1 trio."); + throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator, and must be provided a valid PED file (-ped) from the command line."); } } - Map toRet = new HashMap(1); - boolean hasAppropriateGenotypes = vc.hasGenotype(motherId) && vc.getGenotype(motherId).hasLikelihoods() && - vc.hasGenotype(fatherId) && vc.getGenotype(fatherId).hasLikelihoods() && - vc.hasGenotype(childId) && vc.getGenotype(childId).hasLikelihoods(); - if ( hasAppropriateGenotypes ) - toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc,motherId,fatherId,childId)); + Map attributeMap = new HashMap(1); + //double pNoMV = 1.0; + double maxMVLR = Double.MIN_VALUE; + for ( Trio trio : trios ) { + if ( contextHasTrioLikelihoods(vc,trio) ) { + Double likR = 
mendelianViolation.violationLikelihoodRatio(vc,trio.getMaternalID(),trio.getPaternalID(),trio.getChildID()); + maxMVLR = likR > maxMVLR ? likR : maxMVLR; + //pNoMV *= (1.0-Math.pow(10.0,likR)/(1+Math.pow(10.0,likR))); + } + } - return toRet; + //double pSomeMV = 1.0-pNoMV; + //toRet.put("MVLR",Math.log10(pSomeMV)-Math.log10(1.0-pSomeMV)); + if ( Double.compare(maxMVLR,Double.MIN_VALUE) != 0 ) + attributeMap.put(MVLR_KEY,maxMVLR); + return attributeMap; } // return the descriptions used for the VCF INFO meta field - public List getKeyNames() { return Arrays.asList("MVLR"); } + public List getKeyNames() { return Arrays.asList(MVLR_KEY); } - public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } + public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine(MVLR_KEY, 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); } - private boolean checkAndSetSamples(SampleDB db){ - Set families = db.getFamilyIDs(); - if(families.size() != 1) - return false; - Set family = db.getFamily(families.iterator().next()); - if(family.size() != 3) - return false; - - Iterator sampleIter = family.iterator(); - Sample sample; - for(sample = sampleIter.next();sampleIter.hasNext();sample=sampleIter.next()){ - if(sample.getParents().size()==2){ - motherId = sample.getMaternalID(); - fatherId = sample.getPaternalID(); - childId = sample.getID(); - return true; - } + private boolean contextHasTrioLikelihoods(VariantContext context, Trio trio) { + for ( String sample : Arrays.asList(trio.getMaternalID(),trio.getPaternalID(),trio.getChildID()) ) { + if ( ! context.hasGenotype(sample) ) + return false; + if ( ! 
context.getGenotype(sample).hasLikelihoods() ) + return false; } - return false; + + return true; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 31067e386..82596a501 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -1,8 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; -import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -15,7 +14,7 @@ import java.util.*; /** * The u-based z-approximation from the Mann-Whitney Rank Sum Test for mapping qualities (reads with ref bases vs. those with the alternate allele) - * Note that the mapping quality rank sum test can not be calculated for homozygous sites. + * Note that the mapping quality rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. */ public class MappingQualityRankSumTest extends RankSumTest implements StandardAnnotation { @@ -23,60 +22,36 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn public List getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MQRankSum", 1, VCFHeaderLineType.Float, "Z-score From Wilcoxon rank sum test of Alt vs. 
Ref read mapping qualities")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - for ( final PileupElement p : pileup ) { - if ( isUsableBase(p) ) { - if ( p.getBase() == ref ) { - refQuals.add((double)p.getMappingQual()); - } else if ( alts.contains(p.getBase()) ) { - altQuals.add((double)p.getMappingQual()); - } - } - } - } + protected void fillQualsFromPileup(final List allAlleles, + final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap likelihoodMap, + final List refQuals, final List altQuals) { - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; - - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - if ( matchesRef ) - refQuals.add((double)read.getMappingQuality()); - else - altQuals.add((double)read.getMappingQuality()); - } - } - } - - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele ? 
- HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p: pileup) { - if (indelLikelihoodMap.containsKey(p) && p.getMappingQual() != 0 && p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE) { - // retrieve likelihood information corresponding to this read - LinkedHashMap el = indelLikelihoodMap.get(p); - // by design, first element in LinkedHashMap was ref allele - double refLikelihood=0.0, altLikelihood=Double.NEGATIVE_INFINITY; - - for (Map.Entry a : el.entrySet()) { - - if (a.getKey().isReference()) - refLikelihood = a.getValue(); - else { - double like = a.getValue(); - if (like >= altLikelihood) - altLikelihood = like; + if (pileup != null && likelihoodMap == null) { + // no per-read likelihoods available: + for ( final PileupElement p : pileup ) { + if ( isUsableBase(p) ) { + if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { + refQuals.add((double)p.getMappingQual()); + } else if ( allAlleles.contains(Allele.create(p.getBase()))) { + altQuals.add((double)p.getMappingQual()); } } - if (refLikelihood > altLikelihood + INDEL_LIKELIHOOD_THRESH) - refQuals.add((double)p.getMappingQual()); - else if (altLikelihood > refLikelihood + INDEL_LIKELIHOOD_THRESH) - altQuals.add((double)p.getMappingQual()); } + return; + } + for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + refQuals.add((double)el.getKey().getMappingQuality()); + else if (allAlleles.contains(a)) + altQuals.add((double)el.getKey().getMappingQuality()); + + } } - -} \ No newline at end of file + + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java index 
372d5bc9e..364bbdbb9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZero.java @@ -3,14 +3,17 @@ package org.broadinstitute.sting.gatk.walkers.annotator; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.Arrays; @@ -22,9 +25,25 @@ import java.util.Map; /** * Total count across all samples of mapping quality zero reads */ -public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation { +public class MappingQualityZero extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final 
Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + if ((vc.isSNP() || !vc.isVariant()) && stratifiedContexts != null) + return annotatePileup(ref, stratifiedContexts, vc); + else if (stratifiedPerReadAlleleLikelihoodMap != null && vc.isVariant()) + return annotateWithLikelihoods(stratifiedPerReadAlleleLikelihoodMap, vc); + else + return null; + } + + private Map annotatePileup(final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { if ( stratifiedContexts.size() == 0 ) return null; @@ -42,6 +61,25 @@ public class MappingQualityZero extends InfoFieldAnnotation implements StandardA return map; } + private Map annotateWithLikelihoods(final Map stratifiedPerReadAlleleLikelihoodMap, + final VariantContext vc) { + if (stratifiedPerReadAlleleLikelihoodMap == null) + return null; + + int mq0 = 0; + for ( PerReadAlleleLikelihoodMap likelihoodMap : stratifiedPerReadAlleleLikelihoodMap.values() ) { + for (GATKSAMRecord read : likelihoodMap.getLikelihoodReadMap().keySet()) { + + if (read.getMappingQuality() == 0 ) + mq0++; + } + } + Map map = new HashMap(); + map.put(getKeyNames().get(0), String.format("%d", mq0)); + return map; + } + + public List getKeyNames() { return Arrays.asList(VCFConstants.MAPPING_QUALITY_ZERO_KEY); } public List getDescriptions() { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java index b5252f15b..afb4ceb60 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroBySample.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.GenotypeAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -46,14 +47,19 @@ import java.util.List; * Count for each sample of mapping quality zero reads */ public class MappingQualityZeroBySample extends GenotypeAnnotation { - public void annotate(RefMetaDataTracker tracker, - AnnotatorCompatible walker, ReferenceContext ref, AlignmentContext context, - VariantContext vc, Genotype g, GenotypeBuilder gb) { - if ( g == null || !g.isCalled() ) + public void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap){ + if ( g == null || !g.isCalled() || stratifiedContext == null ) return; int mq0 = 0; - final ReadBackedPileup pileup = context.getBasePileup(); + final ReadBackedPileup pileup = stratifiedContext.getBasePileup(); for (PileupElement p : pileup ) { if ( p.getMappingQual() == 0 ) mq0++; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java index 9f542e3bd..5f9f3416d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityZeroFraction.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -22,7 +23,12 @@ import java.util.Map; */ public class MappingQualityZeroFraction extends InfoFieldAnnotation implements ExperimentalAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java index ba4303b4a..3e6aa62a2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/NBaseCount.java @@ -5,6 +5,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.BaseUtils; import 
org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -20,7 +21,12 @@ import java.util.Map; * The number of N bases, counting only SOLiD data */ public class NBaseCount extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index b62cd374b..d75947879 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -7,10 +7,9 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -28,8 +27,13 
@@ import java.util.Map; */ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( !vc.hasLog10PError() || stratifiedContexts.size() == 0 ) + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + if ( !vc.hasLog10PError() ) return null; final GenotypesContext genotypes = vc.getGenotypes(); @@ -44,11 +48,20 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( !genotype.isHet() && !genotype.isHomVar() ) continue; - AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) - continue; + if (stratifiedContexts!= null) { + AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + if ( context == null ) + continue; + depth += context.getBasePileup().depthOfCoverage(); - depth += context.getBasePileup().depthOfCoverage(); + } + else if (perReadAlleleLikelihoodMap != null) { + PerReadAlleleLikelihoodMap perReadAlleleLikelihoods = perReadAlleleLikelihoodMap.get(genotype.getSampleName()); + if (perReadAlleleLikelihoods == null || perReadAlleleLikelihoods.isEmpty()) + continue; + + depth += perReadAlleleLikelihoods.getNumberOfStoredElements(); + } } if ( depth == 0 ) @@ -67,39 +80,5 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; - - final GenotypesContext genotypes = vc.getGenotypes(); - if ( genotypes == null 
|| genotypes.size() == 0 ) - return null; - - int depth = 0; - - for ( final Genotype genotype : genotypes ) { - - // we care only about variant calls with likelihoods - if ( !genotype.isHet() && !genotype.isHomVar() ) - continue; - - final Map> alleleBins = stratifiedContexts.get(genotype.getSampleName()); - if ( alleleBins == null ) - continue; - - for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { - depth += alleleBin.getValue().size(); - } - } - - if ( depth == 0 ) - return null; - - double QD = -10.0 * vc.getLog10PError() / (double)depth; - - Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", QD)); - return map; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java index 842fde8ad..474b6b150 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RMSMappingQuality.java @@ -7,21 +7,17 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.codecs.vcf.VCFStandardHeaderLines; import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; 
-import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** @@ -29,25 +25,48 @@ import java.util.Map; */ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAnnotation, ActiveRegionBasedAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + int totalSize = 0, index = 0; + int qualities[]; + if (stratifiedContexts != null) { + if ( stratifiedContexts.size() == 0 ) + return null; - int totalSize = 0; - for ( AlignmentContext context : stratifiedContexts.values() ) - totalSize += context.size(); + for ( AlignmentContext context : stratifiedContexts.values() ) + totalSize += context.size(); - final int[] qualities = new int[totalSize]; - int index = 0; + qualities = new int[totalSize]; - for ( Map.Entry sample : stratifiedContexts.entrySet() ) { - AlignmentContext context = sample.getValue(); - final ReadBackedPileup pileup = context.getBasePileup(); - for (PileupElement p : pileup ) { - if ( p.getMappingQual() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) - qualities[index++] = p.getMappingQual(); + for ( Map.Entry sample : stratifiedContexts.entrySet() ) { + AlignmentContext context = sample.getValue(); + for (PileupElement p : context.getBasePileup() ) + index = fillMappingQualitiesFromPileupAndUpdateIndex(p.getRead(), index, qualities); } } + else if (perReadAlleleLikelihoodMap != null) { + if ( perReadAlleleLikelihoodMap.size() == 0 ) + return null; + + for ( 
PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) + totalSize += perReadLikelihoods.size(); + + qualities = new int[totalSize]; + for ( PerReadAlleleLikelihoodMap perReadLikelihoods : perReadAlleleLikelihoodMap.values() ) { + for (GATKSAMRecord read : perReadLikelihoods.getStoredElements()) + index = fillMappingQualitiesFromPileupAndUpdateIndex(read, index, qualities); + + + } + } + else + return null; + + double rms = MathUtils.rms(qualities); Map map = new HashMap(); @@ -55,32 +74,12 @@ public class RMSMappingQuality extends InfoFieldAnnotation implements StandardAn return map; } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if ( stratifiedContexts.size() == 0 ) - return null; + private static int fillMappingQualitiesFromPileupAndUpdateIndex(final GATKSAMRecord read, final int inputIdx, final int[] qualities) { + int outputIdx = inputIdx; + if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) + qualities[outputIdx++] = read.getMappingQuality(); - int depth = 0; - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final Map.Entry> alleleBin : alleleBins.entrySet() ) { - depth += alleleBin.getValue().size(); - } - } - - final int[] qualities = new int[depth]; - int index = 0; - - for ( final Map> alleleBins : stratifiedContexts.values() ) { - for ( final List reads : alleleBins.values() ) { - for ( final GATKSAMRecord read : reads ) { - if ( read.getMappingQuality() != QualityUtils.MAPPING_QUALITY_UNAVAILABLE ) - qualities[index++] = read.getMappingQuality(); - } - } - } - - final Map map = new HashMap(); - map.put(getKeyNames().get(0), String.format("%.2f", MathUtils.rms(qualities))); - return map; + return outputIdx; } public List getKeyNames() { return Arrays.asList(VCFConstants.RMS_MAPPING_QUALITY_KEY); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index bf6adcfac..0df7aff71 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -1,39 +1,41 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MannWhitneyU; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; +import java.util.*; /** * Abstract root for all RankSum based annotations */ public abstract class RankSumTest extends InfoFieldAnnotation implements 
ActiveRegionBasedAnnotation { - static final double INDEL_LIKELIHOOD_THRESH = 0.1; static final boolean DEBUG = false; + private boolean useDithering = true; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { - if (stratifiedContexts.size() == 0) - return null; + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { + // either stratifiedContexts or stratifiedPerReadAlleleLikelihoodMap has to be non-null final GenotypesContext genotypes = vc.getGenotypes(); if (genotypes == null || genotypes.size() == 0) @@ -42,40 +44,31 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR final ArrayList refQuals = new ArrayList(); final ArrayList altQuals = new ArrayList(); - if ( vc.isSNP() ) { - final List altAlleles = new ArrayList(); - for ( final Allele a : vc.getAlternateAlleles() ) - altAlleles.add(a.getBases()[0]); + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + PerReadAlleleLikelihoodMap indelLikelihoodMap = null; + ReadBackedPileup pileup = null; - for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { + + if (stratifiedContexts != null) { final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) - continue; - - fillQualsFromPileup(ref.getBase(), altAlleles, context.getBasePileup(), refQuals, altQuals); + if ( context != null ) + pileup = context.getBasePileup(); } - } else if ( vc.isIndel() || vc.isMixed() ) { + if (stratifiedPerReadAlleleLikelihoodMap != null ) + indelLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); - for (final Genotype genotype : genotypes.iterateInSampleNameOrder()) { - final AlignmentContext context = 
stratifiedContexts.get(genotype.getSampleName()); - if (context == null) { - continue; - } + if (indelLikelihoodMap != null && indelLikelihoodMap.isEmpty()) + indelLikelihoodMap = null; + // treat an empty likelihood map as a null reference - will simplify contract with fillQualsFromPileup + if (indelLikelihoodMap == null && pileup == null) + continue; - final ReadBackedPileup pileup = context.getBasePileup(); - if (pileup == null) - continue; - - if (IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap() == null || - IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().size() == 0) - return null; - - fillIndelQualsFromPileup(pileup, refQuals, altQuals); - } - } else + fillQualsFromPileup(vc.getAlleles(), vc.getStart(), pileup, indelLikelihoodMap, refQuals, altQuals ); + } + if (refQuals.isEmpty() && altQuals.isEmpty()) return null; - final MannWhitneyU mannWhitneyU = new MannWhitneyU(); + final MannWhitneyU mannWhitneyU = new MannWhitneyU(useDithering); for (final Double qual : altQuals) { mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); } @@ -103,50 +96,12 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR return map; } - public Map annotate(Map>> stratifiedContexts, VariantContext vc) { - if (stratifiedContexts.size() == 0) - return null; - - final GenotypesContext genotypes = vc.getGenotypes(); - if (genotypes == null || genotypes.size() == 0) - return null; - - final ArrayList refQuals = new ArrayList(); - final ArrayList altQuals = new ArrayList(); - - for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { - final Map> context = stratifiedContexts.get(genotype.getSampleName()); - if ( context == null ) - continue; - - fillQualsFromPileup(vc.getReference(), vc.getAlternateAlleles(), vc.getStart(), context, refQuals, altQuals); - } - - if ( refQuals.size() == 0 || altQuals.size() == 0 ) - return null; - - final MannWhitneyU mannWhitneyU = new MannWhitneyU(); - for (final Double qual : 
altQuals) { - mannWhitneyU.add(qual, MannWhitneyU.USet.SET1); - } - for (final Double qual : refQuals) { - mannWhitneyU.add(qual, MannWhitneyU.USet.SET2); - } - - // we are testing that set1 (the alt bases) have lower quality scores than set2 (the ref bases) - final Pair testResults = mannWhitneyU.runOneSidedTest(MannWhitneyU.USet.SET1); - - final Map map = new HashMap(); - if (!Double.isNaN(testResults.first)) - map.put(getKeyNames().get(0), String.format("%.3f", testResults.first)); - return map; - } - - protected abstract void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, List altQuals); - - protected abstract void fillQualsFromPileup(final byte ref, final List alts, final ReadBackedPileup pileup, final List refQuals, final List altQuals); - - protected abstract void fillIndelQualsFromPileup(final ReadBackedPileup pileup, final List refQuals, final List altQuals); + protected abstract void fillQualsFromPileup(final List alleles, + final int refLoc, + final ReadBackedPileup readBackedPileup, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap, + final List refQuals, + final List altQuals); /** * Can the base in this pileup element be used in comparative tests between ref / alt bases? @@ -174,4 +129,15 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR p.getMappingQual() == QualityUtils.MAPPING_QUALITY_UNAVAILABLE || ((int) p.getQual()) < QualityUtils.MIN_USABLE_Q_SCORE); // need the unBAQed quality score here } + + /** + * Initialize the rank sum test annotation using walker and engine information. Right now this checks to see if + * engine randomization is turned off, and if so does not dither. + * @param walker + * @param toolkit + * @param headerLines + */ + public void initialize ( AnnotatorCompatible walker, GenomeAnalysisEngine toolkit, Set headerLines ) { + useDithering = ! 
toolkit.getArguments().disableRandomization; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index 3456041c7..d01233bb2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -5,7 +5,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; -import org.broadinstitute.sting.gatk.walkers.genotyper.IndelGenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -20,7 +20,7 @@ import java.util.*; /** * The u-based z-approximation from the Mann-Whitney Rank Sum Test for the distance from the end of the read for reads with the alternate allele; if the alternate allele is only seen near the ends of reads this is indicative of error). - * Note that the read position rank sum test can not be calculated for homozygous sites. + * Note that the read position rank sum test can not be calculated for sites without a mixture of reads showing both the reference and alternate alleles. */ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotation { @@ -32,98 +32,64 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio return Arrays.asList(new VCFInfoHeaderLine("ReadPosRankSum", 1, VCFHeaderLineType.Float, "Z-score from Wilcoxon rank sum test of Alt vs. 
Ref read position bias")); } - protected void fillQualsFromPileup(byte ref, List alts, ReadBackedPileup pileup, List refQuals, List altQuals) { - for (final PileupElement p : pileup) { - if (isUsableBase(p)) { - int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); - final int numAlignedBases = AlignmentUtils.getNumAlignedBases(p.getRead()); - if (readPos > numAlignedBases / 2) - readPos = numAlignedBases - (readPos + 1); + protected void fillQualsFromPileup(final List allAlleles, + final int refLoc, + final ReadBackedPileup pileup, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap, + final List refQuals, final List altQuals) { + if (alleleLikelihoodMap == null) { + // use fast SNP-based version if we don't have per-read allele likelihoods + for ( final PileupElement p : pileup ) { + if ( isUsableBase(p) ) { + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); - if ( p.getBase() == ref ) - refQuals.add((double) readPos); - else if ( alts.contains(p.getBase()) ) - altQuals.add((double) readPos); - } - } - } + readPos = getFinalReadPosition(p.getRead(),readPos); - protected void fillQualsFromPileup(final Allele ref, final List alts, final int refLoc, final Map> stratifiedContext, final List refQuals, final List altQuals) { - for ( final Map.Entry> alleleBin : stratifiedContext.entrySet() ) { - final boolean matchesRef = ref.equals(alleleBin.getKey()); - final boolean matchesAlt = alts.contains(alleleBin.getKey()); - if ( !matchesRef && !matchesAlt ) - continue; - - for ( final GATKSAMRecord read : alleleBin.getValue() ) { - final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getSoftStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true ); - if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) - continue; - int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, false, 0, 0 ); - - final int numAlignedBases = 
AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); - if (readPos > numAlignedBases / 2) - readPos = numAlignedBases - (readPos + 1); - - if ( matchesRef ) - refQuals.add((double) readPos); - else - altQuals.add((double) readPos); - } - } - } - - protected void fillIndelQualsFromPileup(ReadBackedPileup pileup, List refQuals, List altQuals) { - // equivalent is whether indel likelihoods for reads corresponding to ref allele are more likely than reads corresponding to alt allele - // to classify a pileup element as ref or alt, we look at the likelihood associated with the allele associated to this element. - // A pileup element then has a list of pairs of form (Allele, likelihood of this allele). - // To classify a pileup element as Ref or Alt, we look at the likelihood of corresponding alleles. - // If likelihood of ref allele > highest likelihood of all alt alleles + epsilon, then this pielup element is "ref" - // otherwise if highest alt allele likelihood is > ref likelihood + epsilon, then this pileup element it "alt" - final HashMap> indelLikelihoodMap = IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(); - for (final PileupElement p : pileup) { - if (indelLikelihoodMap.containsKey(p)) { - LinkedHashMap el = indelLikelihoodMap.get(p); // retrieve likelihood information corresponding to this read - double refLikelihood = 0.0, altLikelihood = Double.NEGATIVE_INFINITY; // by design, first element in LinkedHashMap was ref allele - - for (Map.Entry a : el.entrySet()) { - if (a.getKey().isReference()) - refLikelihood = a.getValue(); - else { - double like = a.getValue(); - if (like >= altLikelihood) - altLikelihood = like; + if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { + refQuals.add((double)readPos); + } else if ( allAlleles.contains(Allele.create(p.getBase()))) { + altQuals.add((double)readPos); } } - - int readPos = getOffsetFromClippedReadStart(p.getRead(), p.getOffset()); - final int numAlignedBases = 
getNumAlignedBases(p.getRead()); - - if (readPos > numAlignedBases / 2) { - readPos = numAlignedBases - (readPos + 1); - } - //if (DEBUG) System.out.format("R:%s start:%d C:%s offset:%d rp:%d readPos:%d alignedB:%d\n",p.getRead().getReadName(),p.getRead().getAlignmentStart(),p.getRead().getCigarString(),p.getOffset(), rp, readPos, numAlignedBases); - - - // if event is beyond span of read just return and don't consider this element. This can happen, for example, with reads - // where soft clipping still left strings of low quality bases but these are later removed by indel-specific clipping. - // if (readPos < -1) - // return; - if (refLikelihood > (altLikelihood + INDEL_LIKELIHOOD_THRESH)) { - refQuals.add((double) readPos); - //if (DEBUG) System.out.format("REF like: %4.1f, pos: %d\n",refLikelihood,readPos); - } else if (altLikelihood > (refLikelihood + INDEL_LIKELIHOOD_THRESH)) { - altQuals.add((double) readPos); - //if (DEBUG) System.out.format("ALT like: %4.1f, pos: %d\n",refLikelihood,readPos); - - } - - } + return; + } + + for (Map.Entry> el : alleleLikelihoodMap.getLikelihoodReadMap().entrySet()) { + final GATKSAMRecord read = el.getKey(); + final int offset = ReadUtils.getReadCoordinateForReferenceCoordinate( read.getSoftStart(), read.getCigar(), refLoc, ReadUtils.ClippingTail.RIGHT_TAIL, true ); + if ( offset == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) + continue; + int readPos = AlignmentUtils.calcAlignmentByteArrayOffset( read.getCigar(), offset, false, false, 0, 0 ); + final int numAlignedBases = AlignmentUtils.getNumAlignedBasesCountingSoftClips( read ); + if (readPos > numAlignedBases / 2) + readPos = numAlignedBases - (readPos + 1); + +// int readPos = getOffsetFromClippedReadStart(el.getKey(), el.getKey().getOffset()); + // readPos = getFinalReadPosition(el.getKey().getRead(),readPos); + + final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + if (a.isNoCall()) + continue; // read is non-informative + if (a.isReference()) + 
refQuals.add((double)readPos); + else if (allAlleles.contains(a)) + altQuals.add((double)readPos); + } } + int getFinalReadPosition(GATKSAMRecord read, int initialReadPosition) { + final int numAlignedBases = getNumAlignedBases(read); + + int readPos = initialReadPosition; + if (initialReadPosition > numAlignedBases / 2) { + readPos = numAlignedBases - (initialReadPosition + 1); + } + return readPos; + + } int getNumClippedBasesAtStart(SAMRecord read) { // compute total number of clipped bases (soft or hard clipped) // check for hard clips (never consider these bases): diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java index 7e4d44cf2..33e895187 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SampleList.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -46,7 +47,12 @@ import java.util.Map; */ public class SampleList extends InfoFieldAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map 
stratifiedPerReadAlleleLikelihoodMap) { if ( vc.isMonomorphicInSamples() || !vc.hasGenotypes() ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java index 4d990e738..b3b0be153 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SnpEff.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -225,7 +226,12 @@ public class SnpEff extends InfoFieldAnnotation implements RodRequiringAnnotatio headerLines.add(new VCFHeaderLine(OUTPUT_VCF_HEADER_COMMAND_LINE_KEY, snpEffCommandLine.getValue())); } - public Map annotate ( RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc ) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { RodBinding snpEffRodBinding = walker.getSnpEffRodBinding(); // Get only SnpEff records that start at this locus, not merely span it: diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java index af2df8e6a..8e1140af1 
100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/SpanningDeletions.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -22,7 +23,12 @@ import java.util.Map; */ public class SpanningDeletions extends InfoFieldAnnotation implements StandardAnnotation { - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java index f220ecbd2..c72ba1c5f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TandemRepeatAnnotator.java @@ -30,6 +30,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import 
org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; @@ -47,7 +48,12 @@ public class TandemRepeatAnnotator extends InfoFieldAnnotation implements Standa private static final String STR_PRESENT = "STR"; private static final String REPEAT_UNIT_KEY = "RU"; private static final String REPEATS_PER_ALLELE_KEY = "RPA"; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( !vc.isIndel()) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java index 63694d809..57b50c6e2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TechnologyComposition.java @@ -7,6 +7,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import 
org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -28,7 +29,12 @@ public class TechnologyComposition extends InfoFieldAnnotation implements Experi private String n454 ="Num454"; private String nSolid = "NumSOLiD"; private String nOther = "NumOther"; - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( stratifiedContexts.size() == 0 ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java index 2e3578dcb..be7288a7e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/TransmissionDisequilibriumTest.java @@ -8,6 +8,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.RodRequiringAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineCount; import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType; @@ -28,7 +29,12 @@ public class TransmissionDisequilibriumTest extends InfoFieldAnnotation 
implemen private Set trios = null; private final static int MIN_NUM_VALID_TRIOS = 5; // don't calculate this population-level statistic if there are less than X trios with full genotype likelihood information - public Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( trios == null ) { if ( walker instanceof VariantAnnotator ) { trios = ((VariantAnnotator) walker).getSampleDB().getChildrenWithParents(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index cce106210..c4de9ed45 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -161,6 +161,9 @@ public class VariantAnnotator extends RodWalker implements Ann @Argument(fullName="useAllAnnotations", shortName="all", doc="Use all possible annotations (not for the faint of heart)", required=false) protected Boolean USE_ALL_ANNOTATIONS = false; + /** + * Note that the --list argument requires a fully resolved and correct command-line to work. 
+ */ @Argument(fullName="list", shortName="ls", doc="List the available annotations and exit") protected Boolean LIST = false; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index 073faf54e..ee4f77752 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -31,9 +31,9 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.*; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variantcontext.*; import java.util.*; @@ -178,7 +178,18 @@ public class VariantAnnotatorEngine { this.requireStrictAlleleMatch = requireStrictAlleleMatch; } - public VariantContext annotateContext(final RefMetaDataTracker tracker, final ReferenceContext ref, final Map stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map stratifiedContexts, + VariantContext vc) { + return annotateContext(tracker, ref, stratifiedContexts, vc, null); + } + + public VariantContext annotateContext(final RefMetaDataTracker tracker, + final ReferenceContext ref, + final Map stratifiedContexts, + VariantContext vc, + final Map perReadAlleleLikelihoodMap) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // annotate db occurrences @@ -189,7 +200,7 @@ public class 
VariantAnnotatorEngine { // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { - Map annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc); + Map annotationsFromCurrentType = annotationType.annotate(tracker, walker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap); if ( annotationsFromCurrentType != null ) infoAnnotations.putAll(annotationsFromCurrentType); } @@ -198,22 +209,28 @@ public class VariantAnnotatorEngine { VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); // annotate genotypes, creating another new VC in the process - return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc)).make(); + return builder.genotypes(annotateGenotypes(tracker, ref, stratifiedContexts, vc, perReadAlleleLikelihoodMap)).make(); } - public VariantContext annotateContext(final Map>> stratifiedContexts, VariantContext vc) { + public VariantContext annotateContext(final Map perReadAlleleLikelihoodMap, VariantContext vc) { Map infoAnnotations = new LinkedHashMap(vc.getAttributes()); // go through all the requested info annotationTypes for ( InfoFieldAnnotation annotationType : requestedInfoAnnotations ) { - Map annotationsFromCurrentType = ((ActiveRegionBasedAnnotation)annotationType).annotate(stratifiedContexts, vc); + if ( !(annotationType instanceof ActiveRegionBasedAnnotation) ) + continue; + + Map annotationsFromCurrentType = annotationType.annotate(perReadAlleleLikelihoodMap, vc); if ( annotationsFromCurrentType != null ) { infoAnnotations.putAll(annotationsFromCurrentType); } } // generate a new annotated VC - return new VariantContextBuilder(vc).attributes(infoAnnotations).make(); + VariantContextBuilder builder = new VariantContextBuilder(vc).attributes(infoAnnotations); + + // annotate genotypes, creating another new VC in the process + return builder.genotypes(annotateGenotypes(null, null, 
null, vc, perReadAlleleLikelihoodMap)).make(); } private VariantContext annotateDBs(RefMetaDataTracker tracker, ReferenceContext ref, VariantContext vc, Map infoAnnotations) { @@ -266,23 +283,29 @@ public class VariantAnnotatorEngine { } } - private GenotypesContext annotateGenotypes(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, VariantContext vc) { + + private GenotypesContext annotateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext ref, final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap) { if ( requestedGenotypeAnnotations.isEmpty() ) return vc.getGenotypes(); final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); for ( final Genotype genotype : vc.getGenotypes() ) { - AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); + AlignmentContext context = null; + PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = null; + if (stratifiedContexts != null) + context = stratifiedContexts.get(genotype.getSampleName()); + if (stratifiedPerReadAlleleLikelihoodMap != null) + perReadAlleleLikelihoodMap = stratifiedPerReadAlleleLikelihoodMap.get(genotype.getSampleName()); - if ( context == null ) { - genotypes.add(genotype); - } else { - final GenotypeBuilder gb = new GenotypeBuilder(genotype); - for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { - annotation.annotate(tracker, walker, ref, context, vc, genotype, gb); - } - genotypes.add(gb.make()); + + final GenotypeBuilder gb = new GenotypeBuilder(genotype); + for ( final GenotypeAnnotation annotation : requestedGenotypeAnnotations ) { + annotation.annotate(tracker, walker, ref, context, vc, genotype, gb, perReadAlleleLikelihoodMap); } + genotypes.add(gb.make()); } return genotypes; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java index de61c7741..03fcba760 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/ActiveRegionBasedAnnotation.java @@ -1,8 +1,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.util.List; @@ -10,8 +9,8 @@ import java.util.Map; // TODO -- make this an abstract class when we move away from InfoFieldAnnotation public interface ActiveRegionBasedAnnotation extends AnnotationType { - // return annotations for the given contexts split by sample and then allele - public abstract Map annotate(final Map>> stratifiedContexts, final VariantContext vc); + // return annotations for the given contexts split by sample and then read likelihood + public abstract Map annotate(final Map stratifiedContexts, final VariantContext vc); // return the descriptions used for the VCF INFO meta field public abstract List getDescriptions(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java index bc20f6c97..6970908b5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/GenotypeAnnotation.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; import 
org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFFormatHeaderLine; import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; @@ -13,9 +14,14 @@ import java.util.List; public abstract class GenotypeAnnotation extends VariantAnnotatorAnnotation { // return annotations for the given contexts/genotype split by sample - public abstract void annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, - ReferenceContext ref, AlignmentContext stratifiedContext, - VariantContext vc, Genotype g, GenotypeBuilder gb ); + public abstract void annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final AlignmentContext stratifiedContext, + final VariantContext vc, + final Genotype g, + final GenotypeBuilder gb, + final PerReadAlleleLikelihoodMap alleleLikelihoodMap); // return the descriptions used for the VCF FORMAT meta field public abstract List getDescriptions(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java index 1569a605f..5b2dc310d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/interfaces/InfoFieldAnnotation.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.annotator.interfaces; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import 
org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -11,8 +12,25 @@ import java.util.Map; public abstract class InfoFieldAnnotation extends VariantAnnotatorAnnotation { // return annotations for the given contexts split by sample - public abstract Map annotate(RefMetaDataTracker tracker, AnnotatorCompatible walker, - ReferenceContext ref, Map stratifiedContexts, VariantContext vc); + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc) { + return annotate(tracker, walker, ref, stratifiedContexts, vc, null); + } + + public Map annotate(Map perReadAlleleLikelihoodMap, VariantContext vc) { + return annotate(null, null, null, null, vc, perReadAlleleLikelihoodMap); + } + + + public abstract Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map stratifiedPerReadAlleleLikelihoodMap); // return the descriptions used for the VCF INFO meta field public abstract List getDescriptions(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java index a6d82d5b3..dbb628135 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGatherer.java @@ -49,7 +49,6 @@ public class BQSRGatherer extends Gatherer { @Override public void gather(List inputs, File output) { - RecalibrationReport generalReport = null; final PrintStream outputFile; try { outputFile = new PrintStream(output); @@ -57,6 +56,7 @@ public 
class BQSRGatherer extends Gatherer { throw new UserException.MissingArgument("output", MISSING_OUTPUT_FILE); } + RecalibrationReport generalReport = null; for (File input : inputs) { final RecalibrationReport inputReport = new RecalibrationReport(input); if (generalReport == null) @@ -70,14 +70,15 @@ public class BQSRGatherer extends Gatherer { generalReport.calculateQuantizedQualities(); RecalibrationArgumentCollection RAC = generalReport.getRAC(); - if (RAC.recalibrationReport != null && !RAC.NO_PLOTS) { - final File recal_out = new File(output.getName() + ".original"); - final RecalibrationReport originalReport = new RecalibrationReport(RAC.recalibrationReport); - RecalUtils.generateRecalibrationPlot(recal_out, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates(), RAC.KEEP_INTERMEDIATE_FILES); - } - else if (!RAC.NO_PLOTS) { - final File recal_out = new File(output.getName() + ".recal"); - RecalUtils.generateRecalibrationPlot(recal_out, generalReport.getRecalibrationTables(), generalReport.getCovariates(), RAC.KEEP_INTERMEDIATE_FILES); + if ( RAC.RECAL_PDF_FILE != null ) { + RAC.RECAL_TABLE_FILE = output; + if ( RAC.existingRecalibrationReport != null ) { + final RecalibrationReport originalReport = new RecalibrationReport(RAC.existingRecalibrationReport); + RecalUtils.generateRecalibrationPlot(RAC, originalReport.getRecalibrationTables(), generalReport.getRecalibrationTables(), generalReport.getCovariates()); + } + else { + RecalUtils.generateRecalibrationPlot(RAC, generalReport.getRecalibrationTables(), generalReport.getCovariates()); + } } generalReport.output(outputFile); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index f61fdda60..9506510a9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -25,38 +25,44 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import net.sf.samtools.CigarElement; import net.sf.samtools.SAMFileHeader; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.Advanced; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.ArgumentCollection; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; -import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; +import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.recalibration.QuantizationInfo; -import org.broadinstitute.sting.utils.recalibration.RecalUtils; -import org.broadinstitute.sting.utils.recalibration.RecalibrationReport; -import 
org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.recalibration.*; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; import java.io.PrintStream; import java.lang.reflect.Constructor; import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; /** - * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as reported quality score, cycle, and dinucleotide). + * First pass of the base quality score recalibration -- Generates recalibration table based on various user-specified covariates (such as read group, reported quality score, machine cycle, and nucleotide context). * *

* This walker is designed to work as the first pass in a two-pass processing step. It does a by-locus traversal operating @@ -103,32 +109,34 @@ import java.util.ArrayList; * */ -@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) -@By(DataSource.READS) -@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class}) // only look at covered loci, not every loci of the reference file -@Requires({DataSource.READS, DataSource.REFERENCE}) // filter out all reads with zero or unavailable mapping quality -@PartitionBy(PartitionType.LOCUS) // this walker requires both -I input.bam and -R reference.fasta -public class BaseRecalibrator extends LocusWalker implements TreeReducible { +@DocumentedGATKFeature(groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class}) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) +@ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) +@PartitionBy(PartitionType.READ) +public class BaseRecalibrator extends ReadWalker implements NanoSchedulable { @ArgumentCollection - private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates + private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates + + @Advanced + @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 
30 is perhaps better for whole genome call sets", required = false) + public double BAQGOP = BAQ.DEFAULT_GOP; + + private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization - private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization - private RecalibrationTables recalibrationTables; - private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental) + private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental) private RecalibrationEngine recalibrationEngine; private int minimumQToUse; - protected static final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped. - protected static final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed. - protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ + protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. 
Please provide a VCF file containing known sites of genetic variation."; + private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector + private IndexedFastaSequenceFile referenceReader; // fasta reference reader for use with BAQ calculation /** * Parse the -cov arguments and create a list of covariates to be used here @@ -136,9 +144,7 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed */ public void initialize() { - // TODO -- remove me after the 2.1 release - if ( getToolkit().getArguments().numberOfThreads > 1 ) - throw new UserException("We have temporarily disabled the ability to run BaseRecalibrator multi-threaded for performance reasons. We hope to have this fixed for the next GATK release (2.2) and apologize for the inconvenience."); + baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty // check for unsupported access if (getToolkit().isGATKLite() && !getToolkit().getArguments().disableIndelQuals) @@ -147,16 +153,16 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed if (RAC.FORCE_PLATFORM != null) RAC.DEFAULT_PLATFORM = RAC.FORCE_PLATFORM; - if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified + if (RAC.knownSites.isEmpty() && !RAC.RUN_WITHOUT_DBSNP) // Warn the user if no dbSNP file or other variant mask was specified throw new UserException.CommandLineException(NO_DBSNP_EXCEPTION); if (RAC.LIST_ONLY) { RecalUtils.listAvailableCovariates(logger); System.exit(0); } - RAC.recalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table + RAC.existingRecalibrationReport = getToolkit().getArguments().BQSR_RECAL_FILE; // if we have a recalibration file, record it so it goes on the report table - Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates 
+ Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates ArrayList requiredCovariates = covariates.getFirst(); ArrayList optionalCovariates = covariates.getSecond(); @@ -168,27 +174,41 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed requestedCovariates[covariateIndex++] = covariate; logger.info("The covariates being used here: "); - for (Covariate cov : requestedCovariates) { // list all the covariates being used + for (Covariate cov : requestedCovariates) { // list all the covariates being used logger.info("\t" + cov.getClass().getSimpleName()); - cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection + cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection + } + + try { + RAC.RECAL_TABLE = new PrintStream(RAC.RECAL_TABLE_FILE); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e); } int numReadGroups = 0; for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) numReadGroups += header.getReadGroups().size(); - recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups); + recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); recalibrationEngine = initializeRecalibrationEngine(); recalibrationEngine.initialize(requestedCovariates, recalibrationTables); minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; + + try { + // fasta reference reader for use with BAQ calculation + referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile); + } catch( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e); + } + } private RecalibrationEngine initializeRecalibrationEngine() { final Class recalibrationEngineClass = 
GATKLiteUtils.getProtectedClassIfAvailable(RecalibrationEngine.class); try { - Constructor constructor = recalibrationEngineClass.getDeclaredConstructor((Class[])null); + final Constructor constructor = recalibrationEngineClass.getDeclaredConstructor((Class[])null); constructor.setAccessible(true); return (RecalibrationEngine)constructor.newInstance(); } @@ -197,56 +217,207 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed } } - private boolean readHasBeenSkipped( final GATKSAMRecord read ) { - return read.containsTemporaryAttribute(SKIP_RECORD_ATTRIBUTE); - } - - private boolean isLowQualityBase( final PileupElement p ) { - return p.getQual() < minimumQToUse; - } - - private boolean readNotSeen( final GATKSAMRecord read ) { - return !read.containsTemporaryAttribute(SEEN_ATTRIBUTE); + private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) { + return read.getBaseQualities()[offset] < minimumQToUse; } /** * For each read at this locus get the various covariate values and increment that location in the map based on * whether or not the base matches the reference at this particular location - * - * @param tracker the reference metadata tracker - * @param ref the reference context - * @param context the alignment context - * @return returns 1, but this value isn't used in the reduce step */ - public Long map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - long countedSites = 0L; - if (tracker.getValues(RAC.knownSites).size() == 0) { // Only analyze sites not present in the provided known sites - for (final PileupElement p : context.getBasePileup()) { - final GATKSAMRecord read = p.getRead(); - final int offset = p.getOffset(); + public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) { - if (readHasBeenSkipped(read) || p.isInsertionAtBeginningOfRead() || isLowQualityBase(p) ) // This read has been marked to be skipped or base is 
low quality (we don't recalibrate low quality bases) - continue; + final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(originalRead); + if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it - if (readNotSeen(read)) { - read.setTemporaryAttribute(SEEN_ATTRIBUTE, true); - RecalUtils.parsePlatformForRead(read, RAC); - if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { - read.setTemporaryAttribute(SKIP_RECORD_ATTRIBUTE, true); - continue; + RecalUtils.parsePlatformForRead(read, RAC); + if (!RecalUtils.isColorSpaceConsistent(RAC.SOLID_NOCALL_STRATEGY, read)) { // parse the solid color space and check for color no-calls + return 0L; // skip this read completely + } + read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates)); + + final boolean[] skip = calculateSkipArray(read, metaDataTracker); // skip known sites of variation as well as low quality and non-regular bases + final int[] isSNP = calculateIsSNP(read, ref, originalRead); + final int[] isInsertion = calculateIsIndel(read, EventType.BASE_INSERTION); + final int[] isDeletion = calculateIsIndel(read, EventType.BASE_DELETION); + final byte[] baqArray = calculateBAQArray(read); + + if( baqArray != null ) { // some reads just can't be BAQ'ed + final double[] snpErrors = calculateFractionalErrorArray(isSNP, baqArray); + final double[] insertionErrors = calculateFractionalErrorArray(isInsertion, baqArray); + final double[] deletionErrors = calculateFractionalErrorArray(isDeletion, baqArray); + recalibrationEngine.updateDataForRead(read, skip, snpErrors, insertionErrors, deletionErrors); + return 1L; + } else { + return 0L; + } + } + + protected boolean[] calculateSkipArray( final GATKSAMRecord read, final RefMetaDataTracker metaDataTracker ) { + final byte[] bases = read.getReadBases(); + final boolean[] skip = new boolean[bases.length]; + final boolean[] knownSites = calculateKnownSites(read, 
metaDataTracker.getValues(RAC.knownSites)); + for( int iii = 0; iii < bases.length; iii++ ) { + skip[iii] = !BaseUtils.isRegularBase(bases[iii]) || isLowQualityBase(read, iii) || knownSites[iii] || badSolidOffset(read, iii); + } + return skip; + } + + protected boolean badSolidOffset( final GATKSAMRecord read, final int offset ) { + return ReadUtils.isSOLiDRead(read) && RAC.SOLID_RECAL_MODE != RecalUtils.SOLID_RECAL_MODE.DO_NOTHING && !RecalUtils.isColorSpaceConsistent(read, offset); + } + + protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List features ) { + final int BUFFER_SIZE = 0; + final int readLength = read.getReadBases().length; + final boolean[] knownSites = new boolean[readLength]; + Arrays.fill(knownSites, false); + for( final Feature f : features ) { + int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here? 
+ if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; } + int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true); + if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureEndOnRead = readLength; } + Arrays.fill(knownSites, Math.max(0, featureStartOnRead - BUFFER_SIZE), Math.min(readLength, featureEndOnRead + 1 + BUFFER_SIZE), true); + } + return knownSites; + } + + // BUGBUG: can be merged with calculateIsIndel + protected static int[] calculateIsSNP( final GATKSAMRecord read, final ReferenceContext ref, final GATKSAMRecord originalRead ) { + final byte[] readBases = read.getReadBases(); + final byte[] refBases = Arrays.copyOfRange(ref.getBases(), read.getAlignmentStart() - originalRead.getAlignmentStart(), ref.getBases().length + read.getAlignmentEnd() - originalRead.getAlignmentEnd()); + final int[] snp = new int[readBases.length]; + int readPos = 0; + int refPos = 0; + for ( final CigarElement ce : read.getCigar().getCigarElements() ) { + final int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + case EQ: + case X: + for( int iii = 0; iii < elementLength; iii++ ) { + snp[readPos] = ( BaseUtils.basesAreEqual(readBases[readPos], refBases[refPos]) ? 
0 : 1 ); + readPos++; + refPos++; } - read.setTemporaryAttribute(COVARS_ATTRIBUTE, RecalUtils.computeCovariates(read, requestedCovariates)); - } - - if (!ReadUtils.isSOLiDRead(read) || // SOLID bams have inserted the reference base into the read if the color space in inconsistent with the read base so skip it - RAC.SOLID_RECAL_MODE == RecalUtils.SOLID_RECAL_MODE.DO_NOTHING || - RecalUtils.isColorSpaceConsistent(read, offset)) - recalibrationEngine.updateDataForPileupElement(p, ref.getBase()); // This base finally passed all the checks for a good base, so add it to the big data hashmap + break; + case D: + case N: + refPos += elementLength; + break; + case I: + case S: // ReferenceContext doesn't have the soft clipped bases! + readPos += elementLength; + break; + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); } - countedSites++; + } + return snp; + } + + protected static int[] calculateIsIndel( final GATKSAMRecord read, final EventType mode ) { + final byte[] readBases = read.getReadBases(); + final int[] indel = new int[readBases.length]; + Arrays.fill(indel, 0); + int readPos = 0; + for ( final CigarElement ce : read.getCigar().getCigarElements() ) { + final int elementLength = ce.getLength(); + switch (ce.getOperator()) { + case M: + case EQ: + case X: + case S: + { + readPos += elementLength; + break; + } + case D: + { + final int index = ( read.getReadNegativeStrandFlag() ? readPos : ( readPos > 0 ? readPos - 1 : readPos ) ); + indel[index] = ( mode.equals(EventType.BASE_DELETION) ? 1 : 0 ); + break; + } + case I: + { + final boolean forwardStrandRead = !read.getReadNegativeStrandFlag(); + if( forwardStrandRead ) { + indel[(readPos > 0 ? readPos - 1 : readPos)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 ); + } + for (int iii = 0; iii < elementLength; iii++) { + readPos++; + } + if( !forwardStrandRead ) { + indel[(readPos < indel.length ? 
readPos : readPos - 1)] = ( mode.equals(EventType.BASE_INSERTION) ? 1 : 0 ); + } + break; + } + case N: + case H: + case P: + break; + default: + throw new ReviewedStingException("Unsupported cigar operator: " + ce.getOperator()); + } + } + return indel; + } + + protected static double[] calculateFractionalErrorArray( final int[] errorArray, final byte[] baqArray ) { + if(errorArray.length != baqArray.length ) { + throw new ReviewedStingException("Array length mismatch detected. Malformed read?"); } - return countedSites; + final byte NO_BAQ_UNCERTAINTY = (byte)'@'; + final int BLOCK_START_UNSET = -1; + + final double[] fractionalErrors = new double[baqArray.length]; + Arrays.fill(fractionalErrors, 0.0); + boolean inBlock = false; + int blockStartIndex = BLOCK_START_UNSET; + int iii; + for( iii = 0; iii < fractionalErrors.length; iii++ ) { + if( baqArray[iii] == NO_BAQ_UNCERTAINTY ) { + if( !inBlock ) { + fractionalErrors[iii] = (double) errorArray[iii]; + } else { + calculateAndStoreErrorsInBlock(iii, blockStartIndex, errorArray, fractionalErrors); + inBlock = false; // reset state variables + blockStartIndex = BLOCK_START_UNSET; // reset state variables + } + } else { + inBlock = true; + if( blockStartIndex == BLOCK_START_UNSET ) { blockStartIndex = iii; } + } + } + if( inBlock ) { + calculateAndStoreErrorsInBlock(iii-1, blockStartIndex, errorArray, fractionalErrors); + } + if( fractionalErrors.length != errorArray.length ) { + throw new ReviewedStingException("Output array length mismatch detected. 
Malformed read?"); + } + return fractionalErrors; + } + + private static void calculateAndStoreErrorsInBlock( final int iii, + final int blockStartIndex, + final int[] errorArray, + final double[] fractionalErrors ) { + int totalErrors = 0; + for( int jjj = Math.max(0,blockStartIndex-1); jjj <= iii; jjj++ ) { + totalErrors += errorArray[jjj]; + } + for( int jjj = Math.max(0, blockStartIndex-1); jjj <= iii; jjj++ ) { + fractionalErrors[jjj] = ((double) totalErrors) / ((double)(iii - Math.max(0,blockStartIndex-1) + 1)); + } + } + + private byte[] calculateBAQArray( final GATKSAMRecord read ) { + baq.baqRead(read, referenceReader, BAQ.CalculationMode.RECALCULATE, BAQ.QualityMode.ADD_TAG); + return BAQ.getBAQTag(read); } /** @@ -270,11 +441,6 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed return sum; } - public Long treeReduce(Long sum1, Long sum2) { - sum1 += sum2; - return sum1; - } - @Override public void onTraversalDone(Long result) { logger.info("Calculating quantized quality scores..."); @@ -284,25 +450,24 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed generateReport(); logger.info("...done!"); - if (!RAC.NO_PLOTS) { + if ( RAC.RECAL_PDF_FILE != null ) { logger.info("Generating recalibration plots..."); generatePlots(); } - logger.info("Processed: " + result + " sites"); + logger.info("Processed: " + result + " reads"); } private void generatePlots() { File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE; if (recalFile != null) { RecalibrationReport report = new RecalibrationReport(recalFile); - RecalUtils.generateRecalibrationPlot(RAC.RECAL_FILE, report.getRecalibrationTables(), recalibrationTables, requestedCovariates, RAC.KEEP_INTERMEDIATE_FILES); + RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), recalibrationTables, requestedCovariates); } else - RecalUtils.generateRecalibrationPlot(RAC.RECAL_FILE, recalibrationTables, requestedCovariates, RAC.KEEP_INTERMEDIATE_FILES); + 
RecalUtils.generateRecalibrationPlot(RAC, recalibrationTables, requestedCovariates); } - /** * go through the quality score table and use the # observations and the empirical quality score * to build a quality score histogram for quantization. Then use the QuantizeQual algorithm to @@ -313,14 +478,6 @@ public class BaseRecalibrator extends LocusWalker implements TreeRed } private void generateReport() { - PrintStream output; - try { - output = new PrintStream(RAC.RECAL_FILE); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_FILE, "could not be created"); - } - - RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates, output); + RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates); } -} - +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index f4b00925e..fc7d8a8a4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -28,10 +28,10 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broad.tribble.Feature; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.recalibration.RecalUtils; import java.io.File; +import java.io.PrintStream; import java.util.Collections; import java.util.List; @@ -59,14 +59,29 @@ public class RecalibrationArgumentCollection { * After the header, data records occur one per line until the end of the file. 
The first several items on a line are the * values of the individual covariates and will change depending on which covariates were specified at runtime. The last * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, - * and the raw empirical quality score calculated by phred-scaling the mismatch rate. + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. Use '/dev/stdout' to print to standard out. */ @Gather(BQSRGatherer.class) - @Output - public File RECAL_FILE; + @Output(doc = "The output recalibration table file to create", required = true) + public File RECAL_TABLE_FILE = null; + public PrintStream RECAL_TABLE; /** - * List all implemented covariates. + * If not provided, then no plots will be generated (useful for queue scatter/gathering). + * However, we *highly* recommend that users generate these plots whenever possible for QC checking. + */ + @Output(fullName = "plot_pdf_file", shortName = "plots", doc = "The output recalibration pdf file to create", required = false) + public File RECAL_PDF_FILE = null; + + /** + * If not provided, then a temporary file is created and then deleted upon completion. + */ + @Hidden + @Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false) + public File RECAL_CSV_FILE = null; + + /** + * Note that the --list argument requires a fully resolved and correct command-line to work. */ @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false) public boolean LIST_ONLY = false; @@ -166,14 +181,12 @@ public class RecalibrationArgumentCollection { @Hidden @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. 
Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; - @Hidden - @Argument(fullName = "keep_intermediate_files", shortName = "k", required = false, doc ="does not remove the temporary csv file created to generate the plots") - public boolean KEEP_INTERMEDIATE_FILES = false; - @Hidden - @Argument(fullName = "no_plots", shortName = "np", required = false, doc = "does not generate any plots -- useful for queue scatter/gathering") - public boolean NO_PLOTS = false; - public File recalibrationReport = null; + @Hidden + @Output(fullName = "recal_table_update_log", shortName = "recal_table_update_log", required = false, doc = "If provided, log all updates to the recalibration tables to the given file. For debugging/testing purposes only") + public PrintStream RECAL_TABLE_UPDATE_LOG = null; + + public File existingRecalibrationReport = null; public GATKReportTable generateReportTable(final String covariateNames) { GATKReportTable argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2); @@ -205,12 +218,10 @@ public class RecalibrationArgumentCollection { argumentsTable.set("force_platform", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, FORCE_PLATFORM); argumentsTable.addRowID("quantizing_levels", true); argumentsTable.set("quantizing_levels", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, QUANTIZING_LEVELS); - argumentsTable.addRowID("keep_intermediate_files", true); - argumentsTable.set("keep_intermediate_files", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, KEEP_INTERMEDIATE_FILES); - argumentsTable.addRowID("no_plots", true); - argumentsTable.set("no_plots", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, NO_PLOTS); argumentsTable.addRowID("recalibration_report", true); - argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, recalibrationReport == null ? 
"null" : recalibrationReport.getAbsolutePath()); + argumentsTable.set("recalibration_report", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, existingRecalibrationReport == null ? "null" : existingRecalibrationReport.getAbsolutePath()); + argumentsTable.addRowID("plot_pdf_file", true); + argumentsTable.set("plot_pdf_file", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, RECAL_PDF_FILE == null ? "null" : RECAL_PDF_FILE.getAbsolutePath()); argumentsTable.addRowID("binary_tag_name", true); argumentsTable.set("binary_tag_name", RecalUtils.ARGUMENT_VALUE_COLUMN_NAME, BINARY_TAG_NAME == null ? "null" : BINARY_TAG_NAME); return argumentsTable; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index 38e306939..962d62d5e 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; /* * Copyright (c) 2009 The Broad Institute @@ -32,6 +33,5 @@ public interface RecalibrationEngine { public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables); - public void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase); - + public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index 08c7da754..6031aa955 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -42,52 +42,33 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP protected RecalibrationTables recalibrationTables; public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { - this.covariates = covariates; + this.covariates = covariates.clone(); this.recalibrationTables = recalibrationTables; } - /** - * Loop through the list of requested covariates and pick out the value from the read, offset, and reference - * Using the list of covariate values as a key, pick out the RecalDatum and increment, - * adding one to the number of observations and potentially one to the number of mismatches for mismatches only. 
- * - * @param pileupElement The pileup element to update - * @param refBase The reference base at this locus - */ - public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) { - final int offset = pileupElement.getOffset(); - final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead()); + @Override + public void updateDataForRead( final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) { + for( int offset = 0; offset < read.getReadBases().length; offset++ ) { + if( !skip[offset] ) { + final ReadCovariates readCovariates = covariateKeySetFrom(read); - final byte qual = pileupElement.getQual(); - final boolean isError = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase); + final byte qual = read.getBaseQualities()[offset]; + final double isError = snpErrors[offset]; - final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION); - final int eventIndex = EventType.BASE_SUBSTITUTION.index; + final int[] keys = readCovariates.getKeySet(offset, EventType.BASE_SUBSTITUTION); + final int eventIndex = EventType.BASE_SUBSTITUTION.index; - final NestedIntegerArray rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); - final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); - final RecalDatum rgThisDatum = createDatumObject(qual, isError); - if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it - rgRecalTable.put(rgThisDatum, keys[0], eventIndex); - else - rgPreviousDatum.combine(rgThisDatum); + combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex); - final NestedIntegerArray qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); - final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], 
eventIndex); - if (qualPreviousDatum == null) - qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex); - else - qualPreviousDatum.increment(isError); + incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex); - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - final NestedIntegerArray covRecalTable = recalibrationTables.getTable(i); - final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); - if (covPreviousDatum == null) - covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); - else - covPreviousDatum.increment(isError); + for (int i = 2; i < covariates.length; i++) { + if (keys[i] < 0) + continue; + + incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + } + } } } @@ -98,8 +79,8 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP * @param isError whether or not the observation is an error * @return a new RecalDatum object with the observation and the error */ - protected RecalDatum createDatumObject(final byte reportedQual, final boolean isError) { - return new RecalDatum(1, isError ? 1:0, reportedQual); + protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { + return new RecalDatum(1, isError, reportedQual); } /** @@ -111,4 +92,63 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP protected ReadCovariates covariateKeySetFrom(GATKSAMRecord read) { return (ReadCovariates) read.getTemporaryAttribute(BaseRecalibrator.COVARS_ATTRIBUTE); } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. 
+ * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final double isError, final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(1.0, isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(1.0, isError); + } + } + + /** + * Combines the RecalDatum at the specified position in the specified table with a new RecalDatum, or put a + * new item there if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + protected void combineDatumOrPutIfNecessary( final NestedIntegerArray table, final byte qual, final double isError, final int... 
keys ) { + final RecalDatum existingDatum = table.get(keys); + final RecalDatum newDatum = createDatumObject(qual, isError); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(newDatum, keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and combine it with our item (item is guaranteed to exist at this point) + table.get(keys).combine(newDatum); + } + } + else { + // Easy case: already an item here, so combine it with our item + existingDatum.combine(newDatum); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index c5b043b7a..44b0d74ca 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -30,7 +30,7 @@ import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java index 9289f86e3..058056c70 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java @@ -29,7 +29,7 @@ import 
org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -140,7 +140,7 @@ public class ReadGroupProperties extends ReadWalker { } @Override - public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) { + public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, RefMetaDataTracker RefMetaDataTracker) { final String rgID = read.getReadGroup().getId(); final PerReadGroupInfo info = readGroupInfo.get(rgID); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java index 1dc8a7ec1..2b84cccc9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadLengthDistribution.java @@ -4,7 +4,7 @@ import net.sf.samtools.SAMReadGroupRecord; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -74,7 +74,7 @@ public class ReadLengthDistribution extends ReadWalker { } 
@Override - public Integer map(ReferenceContext referenceContext, GATKSAMRecord samRecord, ReadMetaDataTracker readMetaDataTracker) { + public Integer map(ReferenceContext referenceContext, GATKSAMRecord samRecord, RefMetaDataTracker RefMetaDataTracker) { GATKReportTable table = report.getTable("ReadLengthDistribution"); int length = Math.abs(samRecord.getReadLength()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java index 112eb278e..cbd3bc950 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/targets/DiagnoseTargets.java @@ -246,6 +246,14 @@ public class DiagnoseTargets extends LocusWalker { */ private void addNewOverlappingIntervals(GenomeLoc refLocus) { GenomeLoc interval = intervalListIterator.peek(); + + // skip any intervals with no coverage that we have passed + while (interval != null && interval.isBefore(refLocus)) { + intervalListIterator.next(); // discard the interval (we've already added it to the map) + interval = intervalListIterator.peek(); + } + + // add any intervals that overlap this one while (interval != null && !interval.isPast(refLocus)) { intervalMap.put(interval, createIntervalStatistic(interval)); intervalListIterator.next(); // discard the interval (we've already added it to the map) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java similarity index 98% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java index 8fbd37e30..2b9744b89 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReference.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -76,10 +76,9 @@ import java.util.List; * */ @DocumentedGATKFeature( groupName = "Companion Utilities", extraDocs = {CommandLineGATK.class} ) -@WalkerName("FastaAlternateReferenceMaker") @Reference(window=@Window(start=-1,stop=50)) @Requires(value={DataSource.REFERENCE}) -public class FastaAlternateReference extends FastaReference { +public class FastaAlternateReferenceMaker extends FastaReferenceMaker { /** * Variants from these input files are used by this tool to construct an alternate reference. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReference.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java similarity index 96% rename from public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReference.java rename to public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java index a835560d4..362867318 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReference.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaReferenceMaker.java @@ -62,15 +62,14 @@ import java.io.PrintStream; *

  * java -Xmx2g -jar GenomeAnalysisTK.jar \
  *   -R ref.fasta \
- *   -T FastaReference \
+ *   -T FastaReferenceMaker \
  *   -o output.fasta \
  *   -L input.intervals
  * 
* */ @DocumentedGATKFeature( groupName = "Companion Utilities", extraDocs = {CommandLineGATK.class} ) -@WalkerName("FastaReferenceMaker") -public class FastaReference extends RefWalker, GenomeLoc> { +public class FastaReferenceMaker extends RefWalker, GenomeLoc> { @Output PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java deleted file mode 100755 index 08a333486..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationModel.java +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; - -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - - -/** - * The model representing how we calculate a genotype given the priors and a pile - * of bases and quality scores - */ -public abstract class AlleleFrequencyCalculationModel implements Cloneable { - - public enum Model { - /** The default model with the best performance in all cases */ - EXACT - } - - protected int N; - protected int MAX_ALTERNATE_ALLELES_TO_GENOTYPE; - protected boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; - - protected Logger logger; - protected PrintStream verboseWriter; - - protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY; - - protected AlleleFrequencyCalculationModel(final UnifiedArgumentCollection UAC, final int N, final Logger logger, final PrintStream verboseWriter) { - this.N = N; - this.MAX_ALTERNATE_ALLELES_TO_GENOTYPE = UAC.MAX_ALTERNATE_ALLELES; - this.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = UAC.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; - this.logger = logger; - this.verboseWriter = verboseWriter; - } - - /** - * Wrapper class that compares two likelihoods associated with two alleles - */ - protected static final class LikelihoodSum implements Comparable { - public double sum = 0.0; - public Allele allele; - - public LikelihoodSum(Allele allele) { this.allele = allele; } - - public int compareTo(LikelihoodSum other) { - final double diff = sum - other.sum; - return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? 
-1 : 0; - } - } - - /** - * Unpack GenotypesContext into arraylist of doubel values - * @param GLs Input genotype context - * @return ArrayList of doubles corresponding to GL vectors - */ - protected static ArrayList getGLs(GenotypesContext GLs) { - ArrayList genotypeLikelihoods = new ArrayList(GLs.size()); - - genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy - for ( Genotype sample : GLs.iterateInSampleNameOrder() ) { - if ( sample.hasLikelihoods() ) { - double[] gls = sample.getLikelihoods().getAsVector(); - - if ( MathUtils.sum(gls) < UnifiedGenotyperEngine.SUM_GL_THRESH_NOCALL ) - genotypeLikelihoods.add(gls); - } - } - - return genotypeLikelihoods; - } - - /** - * Must be overridden by concrete subclasses - * @param vc variant context with alleles and genotype likelihoods - * @param log10AlleleFrequencyPriors priors - * @param result (pre-allocated) object to store likelihoods results - * @return the alleles used for genotyping - */ - protected abstract List getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result); - - /** - * Must be overridden by concrete subclasses - * @param vc variant context with alleles and genotype likelihoods - * @param allelesToUse alleles to subset - * @param assignGenotypes - * @param ploidy - * @return GenotypesContext object - */ - protected abstract GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes, - final int ploidy); - - - // ------------------------------------------------------------------------------------- - // - // protected classes used to store exact model matrix columns - // - // ------------------------------------------------------------------------------------- - - protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first - - // a wrapper around the int array so that we can make it hashable - protected static final class ExactACcounts { - - 
protected final int[] counts; - private int hashcode = -1; - - public ExactACcounts(final int[] counts) { - this.counts = counts; - } - - public int[] getCounts() { - return counts; - } - - @Override - public boolean equals(Object obj) { - return (obj instanceof ExactACcounts) && Arrays.equals(counts, ((ExactACcounts) obj).counts); - } - - @Override - public int hashCode() { - if ( hashcode == -1 ) - hashcode = Arrays.hashCode(counts); - return hashcode; - } - - @Override - public String toString() { - StringBuffer sb = new StringBuffer(); - sb.append(counts[0]); - for ( int i = 1; i < counts.length; i++ ) { - sb.append("/"); - sb.append(counts[i]); - } - return sb.toString(); - } - } - - // This class represents a column in the Exact AC calculation matrix - protected static final class ExactACset { - - // the counts of the various alternate alleles which this column represents - final ExactACcounts ACcounts; - - // the column of the matrix - final double[] log10Likelihoods; - - int sum = -1; - - public ExactACset(final int size, final ExactACcounts ACcounts) { - this.ACcounts = ACcounts; - log10Likelihoods = new double[size]; - Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY); - } - - // sum of all the non-reference alleles - public int getACsum() { - if ( sum == -1 ) { - sum = 0; - for ( int count : ACcounts.getCounts() ) - sum += count; - } - return sum; - } - - public boolean equals(Object obj) { - return (obj instanceof ExactACset) && ACcounts.equals(((ExactACset)obj).ACcounts); - } - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java deleted file mode 100644 index c93e780bf..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/AlleleFrequencyCalculationResult.java +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2010. 
- * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.utils.MathUtils; - -import java.util.Arrays; - -/** - * Created by IntelliJ IDEA. 
- * User: ebanks - * Date: Dec 14, 2011 - * - * Useful helper class to communicate the results of the allele frequency calculation - */ -public class AlleleFrequencyCalculationResult { - - // These variables are intended to contain the MLE and MAP (and their corresponding allele counts) of the site over all alternate alleles - private double log10MLE; - private double log10MAP; - private final int[] alleleCountsOfMLE; - private final int[] alleleCountsOfMAP; - - // The posteriors seen, not including that of AF=0 - private static final int POSTERIORS_CACHE_SIZE = 5000; - private final double[] log10PosteriorMatrixValues = new double[POSTERIORS_CACHE_SIZE]; - private int currentPosteriorsCacheIndex = 0; - private Double log10PosteriorMatrixSum = null; - - // These variables are intended to contain the likelihood/posterior probability for the site's being monomorphic (i.e. AF=0 for all alternate alleles) - private double log10LikelihoodOfAFzero; - private double log10PosteriorOfAFzero; - - - public AlleleFrequencyCalculationResult(final int maxAltAlleles) { - alleleCountsOfMLE = new int[maxAltAlleles]; - alleleCountsOfMAP = new int[maxAltAlleles]; - reset(); - } - - public double getLog10MLE() { - return log10MLE; - } - - public double getLog10MAP() { - return log10MAP; - } - - public double getLog10PosteriorsMatrixSumWithoutAFzero() { - if ( log10PosteriorMatrixSum == null ) { - log10PosteriorMatrixSum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); - } - return log10PosteriorMatrixSum; - } - - public int[] getAlleleCountsOfMLE() { - return alleleCountsOfMLE; - } - - public int[] getAlleleCountsOfMAP() { - return alleleCountsOfMAP; - } - - public double getLog10LikelihoodOfAFzero() { - return log10LikelihoodOfAFzero; - } - - public double getLog10PosteriorOfAFzero() { - return log10PosteriorOfAFzero; - } - - public void reset() { - log10MLE = log10MAP = log10LikelihoodOfAFzero = log10PosteriorOfAFzero = 
AlleleFrequencyCalculationModel.VALUE_NOT_CALCULATED; - for ( int i = 0; i < alleleCountsOfMLE.length; i++ ) { - alleleCountsOfMLE[i] = 0; - alleleCountsOfMAP[i] = 0; - } - currentPosteriorsCacheIndex = 0; - log10PosteriorMatrixSum = null; - } - - public void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) { - if ( log10LofK > log10MLE ) { - log10MLE = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMLE[i] = alleleCountsForK[i]; - } - } - - public void updateMAPifNeeded(final double log10LofK, final int[] alleleCountsForK) { - addToPosteriorsCache(log10LofK); - - if ( log10LofK > log10MAP ) { - log10MAP = log10LofK; - for ( int i = 0; i < alleleCountsForK.length; i++ ) - alleleCountsOfMAP[i] = alleleCountsForK[i]; - } - } - - private void addToPosteriorsCache(final double log10LofK) { - // add to the cache - log10PosteriorMatrixValues[currentPosteriorsCacheIndex++] = log10LofK; - - // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell - if ( currentPosteriorsCacheIndex == POSTERIORS_CACHE_SIZE ) { - final double temporarySum = MathUtils.log10sumLog10(log10PosteriorMatrixValues, 0, currentPosteriorsCacheIndex); - log10PosteriorMatrixValues[0] = temporarySum; - currentPosteriorsCacheIndex = 1; - } - } - - public void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) { - this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero; - if ( log10LikelihoodOfAFzero > log10MLE ) { - log10MLE = log10LikelihoodOfAFzero; - Arrays.fill(alleleCountsOfMLE, 0); - } - } - - public void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) { - this.log10PosteriorOfAFzero = log10PosteriorOfAFzero; - if ( log10PosteriorOfAFzero > log10MAP ) { - log10MAP = log10PosteriorOfAFzero; - Arrays.fill(alleleCountsOfMAP, 0); - } - } -} \ No newline at end of file diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java deleted file mode 100755 index 77a39afc2..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModel.java +++ /dev/null @@ -1,480 +0,0 @@ -/* - * Copyright (c) 2010. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.variantcontext.*; - -import java.io.PrintStream; -import java.util.*; - -public class ExactAFCalculationModel extends AlleleFrequencyCalculationModel { - - // private final static boolean DEBUG = false; - - private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 - - protected ExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { - super(UAC, N, logger, verboseWriter); - } - - public List getLog10PNonRef(final VariantContext vc, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - GenotypesContext GLs = vc.getGenotypes(); - List alleles = vc.getAlleles(); - - final int myMaxAltAllelesToGenotype = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS && vc.getType().equals(VariantContext.Type.INDEL) ? 
2 : MAX_ALTERNATE_ALLELES_TO_GENOTYPE; - - // don't try to genotype too many alternate alleles - if ( vc.getAlternateAlleles().size() > myMaxAltAllelesToGenotype ) { - logger.warn("this tool is currently set to genotype at most " + myMaxAltAllelesToGenotype + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); - - alleles = new ArrayList(myMaxAltAllelesToGenotype + 1); - alleles.add(vc.getReference()); - alleles.addAll(chooseMostLikelyAlternateAlleles(vc, myMaxAltAllelesToGenotype)); - GLs = VariantContextUtils.subsetDiploidAlleles(vc, alleles, false); - } - - linearExactMultiAllelic(GLs, alleles.size() - 1, log10AlleleFrequencyPriors, result); - - return alleles; - } - - - private static final int PL_INDEX_OF_HOM_REF = 0; - private static final List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { - final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); - final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; - for ( int i = 0; i < numOriginalAltAlleles; i++ ) - likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); - - // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype - final ArrayList GLs = getGLs(vc.getGenotypes()); - for ( final double[] likelihoods : GLs ) { - final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); - if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); - if ( alleles.alleleIndex1 != 0 ) - likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; - // don't double-count it - if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != 
alleles.alleleIndex1 ) - likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; - } - } - - // sort them by probability mass and choose the best ones - Collections.sort(Arrays.asList(likelihoodSums)); - final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); - for ( int i = 0; i < numAllelesToChoose; i++ ) - bestAlleles.add(likelihoodSums[i].allele); - - final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); - for ( Allele allele : vc.getAlternateAlleles() ) { - if ( bestAlleles.contains(allele) ) - orderedBestAlleles.add(allele); - } - - return orderedBestAlleles; - } - - - // ------------------------------------------------------------------------------------- - // - // Multi-allelic implementation. - // - // ------------------------------------------------------------------------------------- - - public static void linearExactMultiAllelic(final GenotypesContext GLs, - final int numAlternateAlleles, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - final ArrayList genotypeLikelihoods = getGLs(GLs); - final int numSamples = genotypeLikelihoods.size()-1; - final int numChr = 2*numSamples; - - // queue of AC conformations to process - final LinkedList ACqueue = new LinkedList(); - - // mapping of ExactACset indexes to the objects - final HashMap indexesToACset = new HashMap(numChr+1); - - // add AC=0 to the queue - int[] zeroCounts = new int[numAlternateAlleles]; - ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts)); - ACqueue.add(zeroSet); - indexesToACset.put(zeroSet.ACcounts, zeroSet); - - // keep processing while we have AC conformations that need to be calculated - double maxLog10L = Double.NEGATIVE_INFINITY; - while ( !ACqueue.isEmpty() ) { - // compute log10Likelihoods - final ExactACset set = ACqueue.remove(); - final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, 
maxLog10L, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors, result); - - // adjust max likelihood seen if needed - maxLog10L = Math.max(maxLog10L, log10LofKs); - - // clean up memory - indexesToACset.remove(set.ACcounts); - //if ( DEBUG ) - // System.out.printf(" *** removing used set=%s%n", set.ACcounts); - } - } - - private static final class DependentSet { - public final int[] ACcounts; - public final int PLindex; - - public DependentSet(final int[] ACcounts, final int PLindex) { - this.ACcounts = ACcounts; - this.PLindex = PLindex; - } - } - - private static double calculateAlleleCountConformation(final ExactACset set, - final ArrayList genotypeLikelihoods, - final double maxLog10L, - final int numChr, - final LinkedList ACqueue, - final HashMap indexesToACset, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - //if ( DEBUG ) - // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); - - // compute the log10Likelihoods - computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors, result); - - final double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; - - // can we abort early because the log10Likelihoods are so small? 
- if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { - //if ( DEBUG ) - // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); - return log10LofK; - } - - // iterate over higher frequencies if possible - final int ACwiggle = numChr - set.getACsum(); - if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies - return log10LofK; - - final int numAltAlleles = set.ACcounts.getCounts().length; - - // add conformations for the k+1 case - for ( int allele = 0; allele < numAltAlleles; allele++ ) { - final int[] ACcountsClone = set.ACcounts.getCounts().clone(); - ACcountsClone[allele]++; - // to get to this conformation, a sample would need to be AB (remember that ref=0) - final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); - updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - } - - // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different - if ( ACwiggle > 1 ) { - final ArrayList differentAlleles = new ArrayList(numAltAlleles * numAltAlleles); - final ArrayList sameAlleles = new ArrayList(numAltAlleles); - - for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { - for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { - final int[] ACcountsClone = set.ACcounts.getCounts().clone(); - ACcountsClone[allele_i]++; - ACcountsClone[allele_j]++; - - // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index) - final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1); - if ( allele_i == allele_j ) - sameAlleles.add(new DependentSet(ACcountsClone, PLindex)); - else - differentAlleles.add(new DependentSet(ACcountsClone, PLindex)); - } - } - - // IMPORTANT: we must first add the cases where the 2 new alleles are different so that the 
queue maintains its ordering - for ( DependentSet dependent : differentAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - for ( DependentSet dependent : sameAlleles ) - updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); - } - - return log10LofK; - } - - // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and - // also pushes its value to the given callingSetIndex. - private static void updateACset(final int[] newSetCounts, - final int numChr, - final ExactACset dependentSet, - final int PLsetIndex, - final Queue ACqueue, - final HashMap indexesToACset, - final ArrayList genotypeLikelihoods) { - final ExactACcounts index = new ExactACcounts(newSetCounts); - if ( !indexesToACset.containsKey(index) ) { - ExactACset set = new ExactACset(numChr/2 +1, index); - indexesToACset.put(index, set); - ACqueue.add(set); - } - - // push data from the dependency to the new set - //if ( DEBUG ) - // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); - pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); - } - - private static void computeLofK(final ExactACset set, - final ArrayList genotypeLikelihoods, - final double[] log10AlleleFrequencyPriors, - final AlleleFrequencyCalculationResult result) { - - set.log10Likelihoods[0] = 0.0; // the zero case - final int totalK = set.getACsum(); - - // special case for k = 0 over all k - if ( totalK == 0 ) { - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) - set.log10Likelihoods[j] = set.log10Likelihoods[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; - - final double log10Lof0 = set.log10Likelihoods[set.log10Likelihoods.length-1]; - result.setLog10LikelihoodOfAFzero(log10Lof0); - result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); - return; - } - - 
// if we got here, then k > 0 for at least one k. - // the non-AA possible conformations were already dealt with by pushes from dependent sets; - // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value - for ( int j = 1; j < set.log10Likelihoods.length; j++ ) { - - if ( totalK < 2*j-1 ) { - final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.log10Likelihoods[j-1] + gl[HOM_REF_INDEX]; - set.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[j], conformationValue); - } - - final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - set.log10Likelihoods[j] = set.log10Likelihoods[j] - logDenominator; - } - - double log10LofK = set.log10Likelihoods[set.log10Likelihoods.length-1]; - - // update the MLE if necessary - result.updateMLEifNeeded(log10LofK, set.ACcounts.counts); - - // apply the priors over each alternate allele - for ( final int ACcount : set.ACcounts.getCounts() ) { - if ( ACcount > 0 ) - log10LofK += log10AlleleFrequencyPriors[ACcount]; - } - result.updateMAPifNeeded(log10LofK, set.ACcounts.counts); - } - - private static void pushData(final ExactACset targetSet, - final ExactACset dependentSet, - final int PLsetIndex, - final ArrayList genotypeLikelihoods) { - final int totalK = targetSet.getACsum(); - - for ( int j = 1; j < targetSet.log10Likelihoods.length; j++ ) { - - if ( totalK <= 2*j ) { // skip impossible conformations - final double[] gl = genotypeLikelihoods.get(j); - final double conformationValue = - determineCoefficient(PLsetIndex, j, targetSet.ACcounts.getCounts(), totalK) + dependentSet.log10Likelihoods[j-1] + gl[PLsetIndex]; - targetSet.log10Likelihoods[j] = MathUtils.approximateLog10SumLog10(targetSet.log10Likelihoods[j], conformationValue); - } - } - } - - private static double determineCoefficient(int PLindex, final int j, 
final int[] ACcounts, final int totalK) { - - // the closed form representation generalized for multiple alleles is as follows: - // AA: (2j - totalK) * (2j - totalK - 1) - // AB: 2k_b * (2j - totalK) - // AC: 2k_c * (2j - totalK) - // BB: k_b * (k_b - 1) - // BC: 2 * k_b * k_c - // CC: k_c * (k_c - 1) - - // find the 2 alleles that are represented by this PL index - GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); - - // *** note that throughout this method we subtract one from the alleleIndex because ACcounts *** - // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. *** - - // the AX het case - if ( alleles.alleleIndex1 == 0 ) - return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK]; - - final int k_i = ACcounts[alleles.alleleIndex1-1]; - - // the hom var case (e.g. BB, CC, DD) - final double coeff; - if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) { - coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; - } - // the het non-ref case (e.g. BC, BD, CD) - else { - final int k_j = ACcounts[alleles.alleleIndex2-1]; - coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; - } - - return coeff; - } - - public GenotypesContext subsetAlleles(final VariantContext vc, - final List allelesToUse, - final boolean assignGenotypes, - final int ploidy) { - return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes); - } - - // ------------------------------------------------------------------------------------- - // - // Deprecated bi-allelic ~O(N) implementation. Kept here for posterity. 
- // - // ------------------------------------------------------------------------------------- - - /** - * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors - * for the exact model calculation - */ -/* - private final static class ExactACCache { - double[] kMinus2, kMinus1, kMinus0; - - private final static double[] create(int n) { - return new double[n]; - } - - public ExactACCache(int n) { - kMinus2 = create(n); - kMinus1 = create(n); - kMinus0 = create(n); - } - - final public void rotate() { - double[] tmp = kMinus2; - kMinus2 = kMinus1; - kMinus1 = kMinus0; - kMinus0 = tmp; - } - - final public double[] getkMinus2() { - return kMinus2; - } - - final public double[] getkMinus1() { - return kMinus1; - } - - final public double[] getkMinus0() { - return kMinus0; - } - } - - public int linearExact(GenotypesContext GLs, - double[] log10AlleleFrequencyPriors, - double[][] log10AlleleFrequencyLikelihoods, - double[][] log10AlleleFrequencyPosteriors) { - final ArrayList genotypeLikelihoods = getGLs(GLs); - final int numSamples = genotypeLikelihoods.size()-1; - final int numChr = 2*numSamples; - - final ExactACCache logY = new ExactACCache(numSamples+1); - logY.getkMinus0()[0] = 0.0; // the zero case - - double maxLog10L = Double.NEGATIVE_INFINITY; - boolean done = false; - int lastK = -1; - - for (int k=0; k <= numChr && ! 
done; k++ ) { - final double[] kMinus0 = logY.getkMinus0(); - - if ( k == 0 ) { // special case for k = 0 - for ( int j=1; j <= numSamples; j++ ) { - kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0]; - } - } else { // k > 0 - final double[] kMinus1 = logY.getkMinus1(); - final double[] kMinus2 = logY.getkMinus2(); - - for ( int j=1; j <= numSamples; j++ ) { - final double[] gl = genotypeLikelihoods.get(j); - final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; - - double aa = Double.NEGATIVE_INFINITY; - double ab = Double.NEGATIVE_INFINITY; - if (k < 2*j-1) - aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0]; - - if (k < 2*j) - ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1]; - - double log10Max; - if (k > 1) { - final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2]; - log10Max = approximateLog10SumLog10(aa, ab, bb); - } else { - // we know we aren't considering the BB case, so we can use an optimized log10 function - log10Max = approximateLog10SumLog10(aa, ab); - } - - // finally, update the L(j,k) value - kMinus0[j] = log10Max - logDenominator; - } - } - - // update the posteriors vector - final double log10LofK = kMinus0[numSamples]; - log10AlleleFrequencyLikelihoods[0][k] = log10LofK; - log10AlleleFrequencyPosteriors[0][k] = log10LofK + log10AlleleFrequencyPriors[k]; - - // can we abort early? 
- lastK = k; - maxLog10L = Math.max(maxLog10L, log10LofK); - if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { - //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); - done = true; - } - - logY.rotate(); - } - - return lastK; - } - - final static double approximateLog10SumLog10(double a, double b, double c) { - return approximateLog10SumLog10(approximateLog10SumLog10(a, b), c); - } -*/ - -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index 6fdc926d5..ae9b01f2d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -103,7 +103,8 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { final AlignmentContextUtils.ReadOrientation contextType, final List allAllelesToUse, final boolean useBAQedPileup, - final GenomeLocParser locParser); + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap); protected int getFilteredDepth(ReadBackedPileup pileup) { @@ -115,4 +116,5 @@ public abstract class GenotypeLikelihoodsCalculationModel implements Cloneable { return count; } + } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index bedffa690..0d9f443e2 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -35,6 
+35,7 @@ import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Haplotype; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.variantcontext.*; @@ -48,29 +49,16 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood private boolean ignoreSNPAllelesWhenGenotypingIndels = false; private PairHMMIndelErrorModel pairModel; - private static ThreadLocal>> indelLikelihoodMap = - new ThreadLocal>>() { - protected synchronized HashMap> initialValue() { - return new HashMap>(); - } - }; private LinkedHashMap haplotypeMap; - // gdebug removeme - // todo -cleanup - private GenomeLoc lastSiteVisited; private List alleleList = new ArrayList(); - static { - indelLikelihoodMap.set(new HashMap>()); - } - protected IndelGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, - UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); + UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM); DEBUG = UAC.OUTPUT_DEBUG_INDEL_INFO; haplotypeMap = new LinkedHashMap(); ignoreSNPAllelesWhenGenotypingIndels = UAC.IGNORE_SNP_ALLELES; @@ -93,16 +81,15 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood final AlignmentContextUtils.ReadOrientation contextType, final List allAllelesToUse, final boolean useBAQedPileup, - final GenomeLocParser locParser) { + final GenomeLocParser locParser, + final Map perReadAlleleLikelihoodMap) { GenomeLoc loc = ref.getLocus(); // if (!ref.getLocus().equals(lastSiteVisited)) { if (contextType == 
AlignmentContextUtils.ReadOrientation.COMPLETE) { // starting a new site: clear allele list - lastSiteVisited = ref.getLocus(); - indelLikelihoodMap.set(new HashMap>()); haplotypeMap.clear(); - + perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods alleleList = getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC, ignoreSNPAllelesWhenGenotypingIndels); if (alleleList.isEmpty()) return null; @@ -130,10 +117,14 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood for (Map.Entry sample : contexts.entrySet()) { AlignmentContext context = AlignmentContextUtils.stratify(sample.getValue(), contextType); + if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){ + // no likelihoods have been computed for this sample at this site + perReadAlleleLikelihoodMap.put(sample.getKey(), PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap()); + } final ReadBackedPileup pileup = context.getBasePileup(); if (pileup != null) { final GenotypeBuilder b = new GenotypeBuilder(sample.getKey()); - final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, getIndelLikelihoodMap()); + final double[] genotypeLikelihoods = pairModel.computeDiploidReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, perReadAlleleLikelihoodMap.get(sample.getKey()), UAC.CONTAMINATION_FRACTION, UAC.contaminationLog); b.PL(genotypeLikelihoods); b.DP(getFilteredDepth(pileup)); genotypes.add(b.make()); @@ -150,10 +141,6 @@ public class IndelGenotypeLikelihoodsCalculationModel extends GenotypeLikelihood return builder.genotypes(genotypes).make(); } - public static HashMap> getIndelLikelihoodMap() { - return indelLikelihoodMap.get(); - } - public static void getHaplotypeMapFromAlleles(final List alleleList, final ReferenceContext ref, final GenomeLoc loc, @@ -245,7 +232,7 @@ public class IndelGenotypeLikelihoodsCalculationModel 
extends GenotypeLikelihood int count = 0; for (PileupElement p : pileup) { if (p.isDeletion() || p.isInsertionAtBeginningOfRead() || BaseUtils.isRegularBase(p.getBase())) - count++; + count += p.getRepresentativeCount(); } return count; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 07d5d2f2d..791cdc325 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -36,24 +36,26 @@ import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.variantcontext.*; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; +import java.util.*; public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { private final boolean useAlleleFromVCF; private final double[] likelihoodSums = new double[4]; - + + private final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap; + protected SNPGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { super(UAC, logger); useAlleleFromVCF = UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; + perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap(); } public VariantContext 
getLikelihoods(final RefMetaDataTracker tracker, @@ -62,7 +64,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC final AlignmentContextUtils.ReadOrientation contextType, final List allAllelesToUse, final boolean useBAQedPileup, - final GenomeLocParser locParser) { + final GenomeLocParser locParser, + final Map sampleLikelihoodMap) { + + sampleLikelihoodMap.clear(); // not used in SNP model, sanity check to delete any older data final byte refBase = ref.getBase(); final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(refBase); @@ -75,8 +80,10 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC ArrayList GLs = new ArrayList(contexts.size()); for ( Map.Entry sample : contexts.entrySet() ) { ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); + if ( UAC.CONTAMINATION_FRACTION > 0.0 ) + pileup = perReadAlleleLikelihoodMap.createPerAlleleDownsampledBasePileup(pileup, UAC.CONTAMINATION_FRACTION, UAC.contaminationLog); if ( useBAQedPileup ) - pileup = createBAQedPileup( pileup ); + pileup = createBAQedPileup(pileup); // create the GenotypeLikelihoods object final DiploidSNPGenotypeLikelihoods GL = new DiploidSNPGenotypeLikelihoods(UAC.PCR_error); @@ -147,8 +154,6 @@ public class SNPGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsC // create the genotypes; no-call everyone for now final GenotypesContext genotypes = GenotypesContext.create(); - final List noCall = new ArrayList(); - noCall.add(Allele.NO_CALL); for ( SampleGenotypeData sampleData : GLs ) { final double[] allLikelihoods = sampleData.GL.getLikelihoods(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java index 30c0f3e18..5f6ddf0f1 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedArgumentCollection.java @@ -27,22 +27,15 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - public class UnifiedArgumentCollection extends StandardCallerArgumentCollection { @Argument(fullName = "genotype_likelihoods_model", shortName = "glm", doc = "Genotype likelihoods calculation model to employ -- SNP is the default option, while INDEL is also available for calling indels and BOTH is available for calling both together", required = false) public GenotypeLikelihoodsCalculationModel.Model GLmodel = GenotypeLikelihoodsCalculationModel.Model.SNP; - /** - * Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus. - */ - @Advanced - @Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false) - protected AlleleFrequencyCalculationModel.Model AFmodel = AlleleFrequencyCalculationModel.Model.EXACT; - /** * The PCR error rate is independent of the sequencing error rate, which is necessary because we cannot necessarily * distinguish between PCR errors vs. sequencing errors. The practical implication for this value is that it @@ -54,8 +47,8 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection /** * Note that calculating the SLOD increases the runtime by an appreciable amount. 
*/ - @Argument(fullName = "noSLOD", shortName = "nosl", doc = "If provided, we will not calculate the SLOD", required = false) - public boolean NO_SLOD = false; + @Argument(fullName = "computeSLOD", shortName = "slod", doc = "If provided, we will calculate the SLOD (SB annotation)", required = false) + public boolean COMPUTE_SLOD = false; /** * Depending on the value of the --max_alternate_alleles argument, we may genotype only a fraction of the alleles being sent on for genotyping. @@ -64,6 +57,12 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "annotateNDA", shortName = "nda", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", required = false) public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false; + /** + * The PairHMM implementation to use for -glm INDEL genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime. + */ + @Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for -glm INDEL genotype likelihood calculations", required = false) + public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.ORIGINAL; + /** * The minimum confidence needed in a given base for it to be used in variant calling. Note that the base quality of a base * is capped by the mapping quality so that bases on reads with low mapping quality may get filtered out depending on this value. 
@@ -75,10 +74,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "max_deletion_fraction", shortName = "deletions", doc = "Maximum fraction of reads with deletions spanning this locus for it to be callable [to disable, set to < 0 or > 1; default:0.05]", required = false) public Double MAX_DELETION_FRACTION = 0.05; - @Hidden - @Argument(fullName = "cap_max_alternate_alleles_for_indels", shortName = "capMaxAltAllelesForIndels", doc = "Cap the maximum number of alternate alleles to genotype for indel calls at 2; overrides the --max_alternate_alleles argument; GSA production use only", required = false) - public boolean CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = false; - // indel-related arguments /** * A candidate indel is genotyped (and potentially called) if there are this number of reads with a consensus indel at a site. @@ -115,10 +110,6 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(fullName = "indelHaplotypeSize", shortName = "indelHSize", doc = "Indel haplotype size", required = false) public int INDEL_HAPLOTYPE_SIZE = 80; - @Hidden - @Argument(fullName = "noBandedIndel", shortName = "noBandedIndel", doc = "Don't do Banded Indel likelihood computation", required = false) - public boolean DONT_DO_BANDED_INDEL_COMPUTATION = false; - @Hidden @Argument(fullName = "indelDebug", shortName = "indelDebug", doc = "Output indel debug info", required = false) public boolean OUTPUT_DEBUG_INDEL_INFO = false; @@ -160,7 +151,7 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection Sample ploidy - equivalent to number of chromosomes per pool. In pooled experiments this should be = # of samples in pool * individual sample ploidy */ @Argument(shortName="ploidy", fullName="sample_ploidy", doc="Plody (number of chromosomes) per sample. 
For pooled data, set to (Number of samples in each pool * Sample Ploidy).", required=false) - int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY; + public int samplePloidy = VariantContextUtils.DEFAULT_PLOIDY; @Hidden @Argument(shortName="minqs", fullName="min_quality_score", doc="Min quality score to consider. Smaller numbers process faster. Default: Q1.", required=false) @@ -186,61 +177,57 @@ public class UnifiedArgumentCollection extends StandardCallerArgumentCollection @Argument(shortName="ef", fullName="exclude_filtered_reference_sites", doc="Don't include in the analysis sites where the reference sample VCF is filtered. Default: false.", required=false) boolean EXCLUDE_FILTERED_REFERENCE_SITES = false; + /** + * Create a new UAC with defaults for all UAC arguments + */ + public UnifiedArgumentCollection() { + super(); + } - // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! - public UnifiedArgumentCollection clone() { - UnifiedArgumentCollection uac = new UnifiedArgumentCollection(); + /** + * Create a new UAC based on the information only our in super-class scac and defaults for all UAC arguments + * @param scac + */ + public UnifiedArgumentCollection(final StandardCallerArgumentCollection scac) { + super(scac); + } - uac.GLmodel = GLmodel; - uac.AFmodel = AFmodel; - uac.heterozygosity = heterozygosity; - uac.PCR_error = PCR_error; - uac.GenotypingMode = GenotypingMode; - uac.OutputMode = OutputMode; - uac.NO_SLOD = NO_SLOD; - uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; - uac.STANDARD_CONFIDENCE_FOR_CALLING = STANDARD_CONFIDENCE_FOR_CALLING; - uac.STANDARD_CONFIDENCE_FOR_EMITTING = STANDARD_CONFIDENCE_FOR_EMITTING; - uac.MIN_BASE_QUALTY_SCORE = MIN_BASE_QUALTY_SCORE; - uac.MAX_DELETION_FRACTION = MAX_DELETION_FRACTION; - uac.MIN_INDEL_COUNT_FOR_GENOTYPING = MIN_INDEL_COUNT_FOR_GENOTYPING; - uac.MIN_INDEL_FRACTION_PER_SAMPLE 
= MIN_INDEL_FRACTION_PER_SAMPLE; - uac.INDEL_HETEROZYGOSITY = INDEL_HETEROZYGOSITY; - uac.INDEL_GAP_OPEN_PENALTY = INDEL_GAP_OPEN_PENALTY; - uac.INDEL_GAP_CONTINUATION_PENALTY = INDEL_GAP_CONTINUATION_PENALTY; - uac.OUTPUT_DEBUG_INDEL_INFO = OUTPUT_DEBUG_INDEL_INFO; - uac.INDEL_HAPLOTYPE_SIZE = INDEL_HAPLOTYPE_SIZE; - uac.alleles = alleles; - uac.MAX_ALTERNATE_ALLELES = MAX_ALTERNATE_ALLELES; - uac.CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS = CAP_MAX_ALTERNATE_ALLELES_FOR_INDELS; - uac.GLmodel = GLmodel; - uac.TREAT_ALL_READS_AS_SINGLE_POOL = TREAT_ALL_READS_AS_SINGLE_POOL; - uac.referenceSampleRod = referenceSampleRod; - uac.referenceSampleName = referenceSampleName; - uac.samplePloidy = samplePloidy; - uac.maxQualityScore = minQualityScore; - uac.phredScaledPrior = phredScaledPrior; - uac.minPower = minPower; - uac.minReferenceDepth = minReferenceDepth; - uac.EXCLUDE_FILTERED_REFERENCE_SITES = EXCLUDE_FILTERED_REFERENCE_SITES; - uac.IGNORE_LANE_INFO = IGNORE_LANE_INFO; + /** + * Create a new UAC with all parameters having the values in uac + * + * @param uac + */ + public UnifiedArgumentCollection(final UnifiedArgumentCollection uac) { + // Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value! 
+ super(uac); + + this.GLmodel = uac.GLmodel; + this.AFmodel = uac.AFmodel; + this.PCR_error = uac.PCR_error; + this.COMPUTE_SLOD = uac.COMPUTE_SLOD; + this.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = uac.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED; + this.MIN_BASE_QUALTY_SCORE = uac.MIN_BASE_QUALTY_SCORE; + this.MAX_DELETION_FRACTION = uac.MAX_DELETION_FRACTION; + this.MIN_INDEL_COUNT_FOR_GENOTYPING = uac.MIN_INDEL_COUNT_FOR_GENOTYPING; + this.MIN_INDEL_FRACTION_PER_SAMPLE = uac.MIN_INDEL_FRACTION_PER_SAMPLE; + this.INDEL_HETEROZYGOSITY = uac.INDEL_HETEROZYGOSITY; + this.INDEL_GAP_OPEN_PENALTY = uac.INDEL_GAP_OPEN_PENALTY; + this.INDEL_GAP_CONTINUATION_PENALTY = uac.INDEL_GAP_CONTINUATION_PENALTY; + this.OUTPUT_DEBUG_INDEL_INFO = uac.OUTPUT_DEBUG_INDEL_INFO; + this.INDEL_HAPLOTYPE_SIZE = uac.INDEL_HAPLOTYPE_SIZE; + this.TREAT_ALL_READS_AS_SINGLE_POOL = uac.TREAT_ALL_READS_AS_SINGLE_POOL; + this.referenceSampleRod = uac.referenceSampleRod; + this.referenceSampleName = uac.referenceSampleName; + this.samplePloidy = uac.samplePloidy; + this.maxQualityScore = uac.minQualityScore; + this.phredScaledPrior = uac.phredScaledPrior; + this.minPower = uac.minPower; + this.minReferenceDepth = uac.minReferenceDepth; + this.EXCLUDE_FILTERED_REFERENCE_SITES = uac.EXCLUDE_FILTERED_REFERENCE_SITES; + this.IGNORE_LANE_INFO = uac.IGNORE_LANE_INFO; + this.pairHMM = uac.pairHMM; // todo- arguments to remove - uac.IGNORE_SNP_ALLELES = IGNORE_SNP_ALLELES; - uac.DONT_DO_BANDED_INDEL_COMPUTATION = DONT_DO_BANDED_INDEL_COMPUTATION; - return uac; - } - - public UnifiedArgumentCollection() { } - - public UnifiedArgumentCollection( final StandardCallerArgumentCollection SCAC ) { - super(); - this.alleles = SCAC.alleles; - this.GenotypingMode = SCAC.GenotypingMode; - this.heterozygosity = SCAC.heterozygosity; - this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES; - this.OutputMode = SCAC.OutputMode; - this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING; - 
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING; + this.IGNORE_SNP_ALLELES = uac.IGNORE_SNP_ALLELES; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java index 507806fbe..36be2e7c6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyper.java @@ -27,12 +27,14 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; -import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; +import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.filters.MappingQualityUnavailableFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; @@ -117,14 +119,14 @@ import java.util.*; */ @DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_INPUT) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.ON_INPUT) @ReadFilters( {BadMateFilter.class, MappingQualityUnavailableFilter.class} ) @Reference(window=@Window(start=-200,stop=200)) 
@By(DataSource.REFERENCE) // TODO -- When LocusIteratorByState gets cleaned up, we should enable multiple @By sources: // TODO -- @By( {DataSource.READS, DataSource.REFERENCE_ORDERED_DATA} ) @Downsample(by=DownsampleType.BY_SAMPLE, toCoverage=250) -public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatible { +public class UnifiedGenotyper extends LocusWalker, UnifiedGenotyper.UGStatistics> implements TreeReducible, AnnotatorCompatible, NanoSchedulable { @ArgumentCollection private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection(); @@ -233,36 +235,26 @@ public class UnifiedGenotyper extends LocusWalker, Unif if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY || UAC.referenceSampleName != null || UAC.referenceSampleRod.isBound()) { - throw new UserException.NotSupportedInGATKLite("Usage of ploidy values different than 2 not supported in this GATK version"); + throw new UserException.NotSupportedInGATKLite("you cannot enable usage of ploidy values other than 2"); } + + if ( UAC.CONTAMINATION_FRACTION > 0.0 ) { + if ( UAC.CONTAMINATION_FRACTION == StandardCallerArgumentCollection.DEFAULT_CONTAMINATION_FRACTION ) { + UAC.CONTAMINATION_FRACTION = 0.0; + logger.warn("setting contamination down-sampling fraction to 0.0 because it is not enabled in GATK-lite"); + } else { + throw new UserException.NotSupportedInGATKLite("you cannot enable usage of contamination down-sampling"); + } + } + } + + if ( UAC.TREAT_ALL_READS_AS_SINGLE_POOL ) { + samples.add(GenotypeLikelihoodsCalculationModel.DUMMY_SAMPLE_NAME); + } else { // get all of the unique sample names samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - - } else { - // in full mode: check for consistency in ploidy/pool calling arguments - // check for correct calculation models -/* if (UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) { - // polyploidy requires POOL GL and AF calculation models to be 
specified right now - if (UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLSNP && UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLINDEL - && UAC.GLmodel != GenotypeLikelihoodsCalculationModel.Model.POOLBOTH) { - throw new UserException("Incorrect genotype calculation model chosen. Only [POOLSNP|POOLINDEL|POOLBOTH] supported with this walker if sample ploidy != 2"); - } - - if (UAC.AFmodel != AlleleFrequencyCalculationModel.Model.POOL) - throw new UserException("Incorrect AF Calculation model. Only POOL model supported if sample ploidy != 2"); - - } - */ - // get all of the unique sample names - if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) { - samples.clear(); - samples.add(GenotypeLikelihoodsCalculationModel.DUMMY_SAMPLE_NAME); - } else { - samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader()); - if (UAC.referenceSampleName != null ) - samples.remove(UAC.referenceSampleName); - } - + if ( UAC.referenceSampleName != null ) + samples.remove(UAC.referenceSampleName); } // check for a bad max alleles value @@ -304,7 +296,7 @@ public class UnifiedGenotyper extends LocusWalker, Unif headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions()); // annotation (INFO) fields from UnifiedGenotyper - if ( !UAC.NO_SLOD ) + if ( UAC.COMPUTE_SLOD ) VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true, VCFConstants.STRAND_BIAS_KEY); if ( UAC.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 3d9724ffb..97254c478 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -34,11 +34,13 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalc; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory; +import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcResult; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -79,11 +81,7 @@ public class UnifiedGenotyperEngine { private ThreadLocal> glcm = new ThreadLocal>(); // the model used for calculating p(non-ref) - private ThreadLocal afcm = new ThreadLocal(); - - // the allele frequency likelihoods and posteriors (allocated once as an optimization) - private ThreadLocal alleleFrequencyCalculationResult = new ThreadLocal(); - private ThreadLocal posteriorsArray = new ThreadLocal(); + private ThreadLocal afcm = new ThreadLocal(); // because the allele frequency priors are constant for a given i, we cache the results to avoid having to recompute everything private final double[] log10AlleleFrequencyPriorsSNPs; @@ -106,8 +104,6 @@ public class UnifiedGenotyperEngine { private final GenomeLocParser genomeLocParser; private final boolean BAQEnabledOnCMDLine; - protected static final double SUM_GL_THRESH_NOCALL = VariantContextUtils.SUM_GL_THRESH_NOCALL; - // --------------------------------------------------------------------------------------------------------- // // Public interface functions @@ -157,19 +153,19 @@ public class 
UnifiedGenotyperEngine { } - /** - * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. - * - * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype - * for every sample in allSamples. If it's null there's no such guarentee. Providing this - * argument is critical when the resulting calls will be written to a VCF file. - * - * @param tracker the meta data tracker - * @param refContext the reference base - * @param rawContext contextual information around the locus - * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) - * @return the VariantCallContext object - */ + /** + * Compute full calls at a given locus. Entry point for engine calls from the UnifiedGenotyper. + * + * If allSamples != null, then the output variantCallContext is guarenteed to contain a genotype + * for every sample in allSamples. If it's null there's no such guarentee. Providing this + * argument is critical when the resulting calls will be written to a VCF file. + * + * @param tracker the meta data tracker + * @param refContext the reference base + * @param rawContext contextual information around the locus + * @param allSamples set of all sample names that we might call (i.e., those in the VCF header) + * @return the VariantCallContext object + */ public List calculateLikelihoodsAndGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, final AlignmentContext rawContext, @@ -177,40 +173,28 @@ public class UnifiedGenotyperEngine { final List results = new ArrayList(2); final List models = getGLModelsToUse(tracker, refContext, rawContext); + + final Map perReadAlleleLikelihoodMap = new HashMap(); + if ( models.isEmpty() ) { results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? 
generateEmptyContext(tracker, refContext, null, rawContext) : null); } else { for ( final GenotypeLikelihoodsCalculationModel.Model model : models ) { + perReadAlleleLikelihoodMap.clear(); final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); if ( stratifiedContexts == null ) { results.add(UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES && UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? generateEmptyContext(tracker, refContext, null, rawContext) : null); } else { - final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); + final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) - results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true)); + results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); } } } - return addMissingSamples(results, allSamples); - } - - private List addMissingSamples(final List calls, final Set allSamples) { - if ( calls.isEmpty() || allSamples == null ) return calls; - - final List withAllSamples = new ArrayList(calls.size()); - for ( final VariantCallContext call : calls ) { - if ( call == null ) - withAllSamples.add(null); - else { - final VariantContext withoutMissing = VariantContextUtils.addMissingSamples(call, allSamples); - withAllSamples.add(new VariantCallContext(withoutMissing, call.confidentlyCalled, call.shouldEmit)); - } - } - - return withAllSamples; + return results; } /** @@ -219,9 +203,13 @@ public class UnifiedGenotyperEngine { * @param tracker the meta data tracker * @param refContext the reference base * @param rawContext contextual information around the locus + * @param 
perReadAlleleLikelihoodMap Map to store per-sample, per-read, per-allele likelihoods (only used for indels) * @return the VariantContext object */ - public VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext) { + public VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map perReadAlleleLikelihoodMap) { final List models = getGLModelsToUse(tracker, refContext, rawContext); if ( models.isEmpty() ) { return null; @@ -231,7 +219,7 @@ public class UnifiedGenotyperEngine { final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); // return the first valid one we encounter if ( stratifiedContexts != null ) - return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model); + return calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); } @@ -247,7 +235,10 @@ public class UnifiedGenotyperEngine { * @param vc the GL-annotated variant context * @return the VariantCallContext object */ - public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, VariantContext vc) { + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final VariantContext vc) { final List models = getGLModelsToUse(tracker, refContext, rawContext); if ( models.isEmpty() ) { return null; @@ -256,7 +247,7 @@ public class UnifiedGenotyperEngine { // return the first one final GenotypeLikelihoodsCalculationModel.Model model = models.get(0); final Map stratifiedContexts = getFilteredAndStratifiedContexts(UAC, refContext, rawContext, model); - return 
calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model); + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, null); } /** @@ -266,7 +257,7 @@ public class UnifiedGenotyperEngine { * @return the VariantCallContext object */ public VariantCallContext calculateGenotypes(VariantContext vc) { - return calculateGenotypes(null, null, null, null, vc, GenotypeLikelihoodsCalculationModel.Model.valueOf("SNP"), false); + return calculateGenotypes(null, null, null, null, vc, GenotypeLikelihoodsCalculationModel.Model.valueOf("SNP"), null); } @@ -277,14 +268,21 @@ public class UnifiedGenotyperEngine { // --------------------------------------------------------------------------------------------------------- // private method called by both UnifiedGenotyper and UGCalcLikelihoods entry points into the engine - private VariantContext calculateLikelihoods(RefMetaDataTracker tracker, ReferenceContext refContext, Map stratifiedContexts, AlignmentContextUtils.ReadOrientation type, List alternateAllelesToUse, boolean useBAQedPileup, final GenotypeLikelihoodsCalculationModel.Model model) { + private VariantContext calculateLikelihoods(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final Map stratifiedContexts, + final AlignmentContextUtils.ReadOrientation type, + final List alternateAllelesToUse, + final boolean useBAQedPileup, + final GenotypeLikelihoodsCalculationModel.Model model, + final Map perReadAlleleLikelihoodMap) { // initialize the data for this thread if that hasn't been done yet if ( glcm.get() == null ) { glcm.set(getGenotypeLikelihoodsCalculationObject(logger, UAC)); } - return glcm.get().get(model.name().toUpperCase()).getLikelihoods(tracker, refContext, stratifiedContexts, type, alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser); + return glcm.get().get(model.name().toUpperCase()).getLikelihoods(tracker, refContext, stratifiedContexts, type, 
alternateAllelesToUse, useBAQedPileup && BAQEnabledOnCMDLine, genomeLocParser, perReadAlleleLikelihoodMap); } private VariantCallContext generateEmptyContext(RefMetaDataTracker tracker, ReferenceContext ref, Map stratifiedContexts, AlignmentContext rawContext) { @@ -315,12 +313,22 @@ public class UnifiedGenotyperEngine { return new VariantCallContext(vc, false); } - public VariantCallContext calculateGenotypes(VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { - return calculateGenotypes(null, null, null, null, vc, model); + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(null, null, null, null, vc, model, perReadAlleleLikelihoodMap); } - public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { - return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false); + public VariantCallContext calculateGenotypes(final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model) { + return calculateGenotypes(null, null, null, null, vc, model, null); + } + + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, + final ReferenceContext refContext, + final AlignmentContext rawContext, + final Map stratifiedContexts, + final VariantContext vc, + final GenotypeLikelihoodsCalculationModel.Model model, + final Map perReadAlleleLikelihoodMap) { + return calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, false,perReadAlleleLikelihoodMap); } /** @@ -334,18 +342,18 @@ public class UnifiedGenotyperEngine { * @param inheritAttributesFromInputVC Output VC will contain attributes inherited from input vc * @return VC with assigned genotypes */ - 
public VariantCallContext calculateGenotypes(RefMetaDataTracker tracker, ReferenceContext refContext, AlignmentContext rawContext, Map stratifiedContexts, VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, - final boolean inheritAttributesFromInputVC) { + public VariantCallContext calculateGenotypes(final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext rawContext, Map stratifiedContexts, + final VariantContext vc, final GenotypeLikelihoodsCalculationModel.Model model, + final boolean inheritAttributesFromInputVC, + final Map perReadAlleleLikelihoodMap) { boolean limitedContext = tracker == null || refContext == null || rawContext == null || stratifiedContexts == null; // initialize the data for this thread if that hasn't been done yet if ( afcm.get() == null ) { - afcm.set(getAlleleFrequencyCalculationObject(N, logger, verboseWriter, UAC)); - alleleFrequencyCalculationResult.set(new AlleleFrequencyCalculationResult(UAC.MAX_ALTERNATE_ALLELES)); - posteriorsArray.set(new double[2]); + afcm.set(AFCalcFactory.createAFCalc(UAC, N, logger)); } - AlleleFrequencyCalculationResult AFresult = alleleFrequencyCalculationResult.get(); // estimate our confidence in a reference call and return if ( vc.getNSamples() == 0 ) { @@ -356,8 +364,7 @@ public class UnifiedGenotyperEngine { generateEmptyContext(tracker, refContext, stratifiedContexts, rawContext)); } - AFresult.reset(); - List allelesUsedInGenotyping = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model), AFresult); + AFCalcResult AFresult = afcm.get().getLog10PNonRef(vc, getAlleleFrequencyPriors(model)); // is the most likely frequency conformation AC=0 for all alternate alleles? 
boolean bestGuessIsRef = true; @@ -366,50 +373,43 @@ public class UnifiedGenotyperEngine { final List myAlleles = new ArrayList(vc.getAlleles().size()); final List alleleCountsofMLE = new ArrayList(vc.getAlleles().size()); myAlleles.add(vc.getReference()); - for ( int i = 0; i < vc.getAlternateAlleles().size(); i++ ) { - final Allele alternateAllele = vc.getAlternateAllele(i); - final int indexOfAllele = allelesUsedInGenotyping.indexOf(alternateAllele); - // the genotyping model may have stripped it out - if ( indexOfAllele == -1 ) + for ( int i = 0; i < AFresult.getAllelesUsedInGenotyping().size(); i++ ) { + final Allele alternateAllele = AFresult.getAllelesUsedInGenotyping().get(i); + if ( alternateAllele.isReference() ) continue; - final int indexOfBestAC = AFresult.getAlleleCountsOfMAP()[indexOfAllele-1]; + // we are non-ref if the probability of being non-ref > the emit confidence. + // the emit confidence is phred-scaled, say 30 => 10^-3. + // the posterior AF > 0 is log10: -5 => 10^-5 + // we are non-ref if 10^-5 < 10^-3 => -5 < -3 + final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0); // if the most likely AC is not 0, then this is a good alternate allele to use - if ( indexOfBestAC != 0 ) { + if ( isNonRef ) { myAlleles.add(alternateAllele); - alleleCountsofMLE.add(AFresult.getAlleleCountsOfMLE()[indexOfAllele-1]); + alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); bestGuessIsRef = false; } // if in GENOTYPE_GIVEN_ALLELES mode, we still want to allow the use of a poor allele else if ( UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { myAlleles.add(alternateAllele); - alleleCountsofMLE.add(AFresult.getAlleleCountsOfMLE()[indexOfAllele-1]); + alleleCountsofMLE.add(AFresult.getAlleleCountAtMLE(alternateAllele)); } } - // calculate p(f>0): - final double[] normalizedPosteriors = generateNormalizedPosteriors(AFresult, 
posteriorsArray.get()); - final double PofF = 1.0 - normalizedPosteriors[0]; + final double PoFGT0 = Math.pow(10, AFresult.getLog10PosteriorOfAFGT0()); - double phredScaledConfidence; - if ( !bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(normalizedPosteriors[0]); - if ( Double.isInfinite(phredScaledConfidence) ) - phredScaledConfidence = -10.0 * AFresult.getLog10PosteriorOfAFzero(); - } else { - phredScaledConfidence = QualityUtils.phredScaleErrorRate(PofF); - if ( Double.isInfinite(phredScaledConfidence) ) { - final double sum = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); - phredScaledConfidence = (MathUtils.compareDoubles(sum, 0.0) == 0 ? 0 : -10.0 * sum); - } - } + // note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice + final double phredScaledConfidence = + Math.abs(! bestGuessIsRef || UAC.GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES + ? -10 * AFresult.getLog10PosteriorOfAFEq0() + : -10 * AFresult.getLog10PosteriorOfAFGT0()); // return a null call if we don't pass the confidence cutoff or the most likely allele frequency is zero if ( UAC.OutputMode != OUTPUT_MODE.EMIT_ALL_SITES && !passesEmitThreshold(phredScaledConfidence, bestGuessIsRef) ) { // technically, at this point our confidence in a reference call isn't accurately estimated // because it didn't take into account samples with no data, so let's get a better estimate - return limitedContext ? null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, 1.0 - PofF); + return limitedContext ? 
null : estimateReferenceConfidence(vc, stratifiedContexts, getTheta(model), true, PoFGT0); } // start constructing the resulting VC @@ -425,7 +425,7 @@ public class UnifiedGenotyperEngine { // print out stats if we have a writer if ( verboseWriter != null && !limitedContext ) - printVerboseData(refContext.getLocus().toString(), vc, PofF, phredScaledConfidence, model); + printVerboseData(refContext.getLocus().toString(), vc, PoFGT0, phredScaledConfidence, model); // *** note that calculating strand bias involves overwriting data structures, so we do that last final HashMap attributes = new HashMap(); @@ -451,32 +451,30 @@ public class UnifiedGenotyperEngine { attributes.put(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, MLEfrequencies); } - if ( !UAC.NO_SLOD && !limitedContext && !bestGuessIsRef ) { + if ( UAC.COMPUTE_SLOD && !limitedContext && !bestGuessIsRef ) { //final boolean DEBUG_SLOD = false; // the overall lod //double overallLog10PofNull = AFresult.log10AlleleFrequencyPosteriors[0]; - double overallLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); + double overallLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); //if ( DEBUG_SLOD ) System.out.println("overallLog10PofF=" + overallLog10PofF); List allAllelesToUse = builder.make().getAlleles(); // the forward lod - VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model); - AFresult.reset(); - afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model), AFresult); + VariantContext vcForward = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.FORWARD, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); + AFresult = afcm.get().getLog10PNonRef(vcForward, getAlleleFrequencyPriors(model)); //double[] normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double forwardLog10PofNull = 
AFresult.getLog10PosteriorOfAFzero(); - double forwardLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); + double forwardLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); + double forwardLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); //if ( DEBUG_SLOD ) System.out.println("forwardLog10PofNull=" + forwardLog10PofNull + ", forwardLog10PofF=" + forwardLog10PofF); // the reverse lod - VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model); - AFresult.reset(); - afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model), AFresult); + VariantContext vcReverse = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.REVERSE, allAllelesToUse, false, model, perReadAlleleLikelihoodMap); + AFresult = afcm.get().getLog10PNonRef(vcReverse, getAlleleFrequencyPriors(model)); //normalizedLog10Posteriors = MathUtils.normalizeFromLog10(AFresult.log10AlleleFrequencyPosteriors, true); - double reverseLog10PofNull = AFresult.getLog10PosteriorOfAFzero(); - double reverseLog10PofF = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); + double reverseLog10PofNull = AFresult.getLog10LikelihoodOfAFEq0(); + double reverseLog10PofF = AFresult.getLog10LikelihoodOfAFGT0(); //if ( DEBUG_SLOD ) System.out.println("reverseLog10PofNull=" + reverseLog10PofNull + ", reverseLog10PofF=" + reverseLog10PofF); double forwardLod = forwardLog10PofF + reverseLog10PofNull - overallLog10PofF; @@ -499,24 +497,18 @@ public class UnifiedGenotyperEngine { // if we are subsetting alleles (either because there were too many or because some were not polymorphic) // then we may need to trim the alleles (because the original VariantContext may have had to pad at the end). 
- if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) + if ( myAlleles.size() != vc.getAlleles().size() && !limitedContext ) // limitedContext callers need to handle allele trimming on their own to keep their perReadAlleleLikelihoodMap alleles in sync vcCall = VariantContextUtils.reverseTrimAlleles(vcCall); - if ( annotationEngine != null && !limitedContext ) { + if ( annotationEngine != null && !limitedContext ) { // limitedContext callers need to handle annotations on their own by calling their own annotationEngine // Note: we want to use the *unfiltered* and *unBAQed* context for the annotations final ReadBackedPileup pileup = rawContext.getBasePileup(); stratifiedContexts = AlignmentContextUtils.splitContextBySampleName(pileup); - vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall); + vcCall = annotationEngine.annotateContext(tracker, refContext, stratifiedContexts, vcCall, perReadAlleleLikelihoodMap); } - return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PofF)); - } - - public static double[] generateNormalizedPosteriors(final AlleleFrequencyCalculationResult AFresult, final double[] normalizedPosteriors) { - normalizedPosteriors[0] = AFresult.getLog10PosteriorOfAFzero(); - normalizedPosteriors[1] = AFresult.getLog10PosteriorsMatrixSumWithoutAFzero(); - return MathUtils.normalizeFromLog10(normalizedPosteriors); + return new VariantCallContext(vcCall, confidentlyCalled(phredScaledConfidence, PoFGT0)); } private Map getFilteredAndStratifiedContexts(UnifiedArgumentCollection UAC, ReferenceContext refContext, AlignmentContext rawContext, final GenotypeLikelihoodsCalculationModel.Model model) { @@ -619,8 +611,6 @@ public class UnifiedGenotyperEngine { AFline.append(i + "/" + N + "\t"); AFline.append(String.format("%.2f\t", ((float)i)/N)); AFline.append(String.format("%.8f\t", getAlleleFrequencyPriors(model)[i])); - AFline.append(String.format("%.8f\t", 
alleleFrequencyCalculationResult.get().getLog10MLE())); - AFline.append(String.format("%.8f\t", alleleFrequencyCalculationResult.get().getLog10MAP())); verboseWriter.println(AFline.toString()); } @@ -686,7 +676,7 @@ public class UnifiedGenotyperEngine { return models; } - protected static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { + public static void computeAlleleFrequencyPriors(final int N, final double[] priors, final double theta) { double sum = 0.0; @@ -740,34 +730,6 @@ public class UnifiedGenotyperEngine { return glcm; } - private static AlleleFrequencyCalculationModel getAlleleFrequencyCalculationObject(int N, Logger logger, PrintStream verboseWriter, UnifiedArgumentCollection UAC) { - - List> afClasses = new PluginManager(AlleleFrequencyCalculationModel.class).getPlugins(); - - // user-specified name - String afModelName = UAC.AFmodel.name(); - - if (!afModelName.contains(GPSTRING) && UAC.samplePloidy != VariantContextUtils.DEFAULT_PLOIDY) - afModelName = GPSTRING + afModelName; - - for (int i = 0; i < afClasses.size(); i++) { - Class afClass = afClasses.get(i); - String key = afClass.getSimpleName().replace("AFCalculationModel","").toUpperCase(); - if (afModelName.equalsIgnoreCase(key)) { - try { - Object args[] = new Object[]{UAC,N,logger,verboseWriter}; - Constructor c = afClass.getDeclaredConstructor(UnifiedArgumentCollection.class, int.class, Logger.class, PrintStream.class); - - return (AlleleFrequencyCalculationModel)c.newInstance(args); - } - catch (Exception e) { - throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); - } - } - } - throw new IllegalArgumentException("Unexpected AlleleFrequencyCalculationModel " + UAC.AFmodel); - } - public static VariantContext getVCFromAllelesRod(RefMetaDataTracker tracker, ReferenceContext ref, GenomeLoc loc, boolean requireSNP, Logger logger, final RodBinding allelesBinding) { if ( tracker == null || ref == null || logger 
== null ) return null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java new file mode 100755 index 000000000..f783267bc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalc.java @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import java.io.File; +import java.util.List; + + +/** + * Generic interface for calculating the probability of alleles segregating given priors and genotype likelihoods + */ +public abstract class AFCalc implements Cloneable { + private final static Logger defaultLogger = Logger.getLogger(AFCalc.class); + + protected final int nSamples; + protected final int maxAlternateAllelesToGenotype; + + protected Logger logger = defaultLogger; + + private SimpleTimer callTimer = new SimpleTimer(); + private final StateTracker stateTracker; + private ExactCallLogger exactCallLogger = null; + + /** + * Create a new AFCalc object capable of calculating the prob. 
that alleles are + * segregating among nSamples with up to maxAltAlleles for SNPs and maxAltAllelesForIndels + * for indels for samples with ploidy + * + * @param nSamples number of samples, must be > 0 + * @param maxAltAlleles maxAltAlleles for SNPs + * @param ploidy the ploidy, must be > 0 + */ + protected AFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples); + if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles); + if ( ploidy < 1 ) throw new IllegalArgumentException("ploidy must be > 0 but got " + ploidy); + + this.nSamples = nSamples; + this.maxAlternateAllelesToGenotype = maxAltAlleles; + this.stateTracker = new StateTracker(maxAltAlleles); + } + + /** + * Enable exact call logging to file + * + * @param exactCallsLog the destination file + */ + public void enableProcessLog(final File exactCallsLog) { + exactCallLogger = new ExactCallLogger(exactCallsLog); + } + + /** + * Use this logger instead of the default logger + * + * @param logger + */ + public void setLogger(Logger logger) { + this.logger = logger; + } + + /** + * Compute the probability of the alleles segregating given the genotype likelihoods of the samples in vc + * + * @param vc the VariantContext holding the alleles and sample information + * @param log10AlleleFrequencyPriors a prior vector nSamples x 2 in length indicating the Pr(AF = i) + * @return result (for programming convenience) + */ + public AFCalcResult getLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) { + if ( vc == null ) throw new IllegalArgumentException("VariantContext cannot be null"); + if ( log10AlleleFrequencyPriors == null ) throw new IllegalArgumentException("priors vector cannot be null"); + if ( stateTracker == null ) throw new IllegalArgumentException("Results object cannot be null"); + + // reset 
the result, so we can store our new result there + stateTracker.reset(); + + final VariantContext vcWorking = reduceScope(vc); + + callTimer.start(); + final AFCalcResult result = computeLog10PNonRef(vcWorking, log10AlleleFrequencyPriors); + final long nanoTime = callTimer.getElapsedTimeNano(); + + if ( exactCallLogger != null ) + exactCallLogger.printCallInfo(vcWorking, log10AlleleFrequencyPriors, nanoTime, result); + + return result; + } + + /** + * Convert the final state of the state tracker into our result as an AFCalcResult + * + * Assumes that stateTracker has been updated accordingly + * + * @param vcWorking the VariantContext we actually used as input to the calc model (after reduction) + * @param log10AlleleFrequencyPriors the priors by AC vector + * @return a AFCalcResult describing the result of this calculation + */ + @Requires("stateTracker.getnEvaluations() >= 0") + @Ensures("result != null") + protected AFCalcResult getResultFromFinalState(final VariantContext vcWorking, final double[] log10AlleleFrequencyPriors) { + stateTracker.setAllelesUsedInGenotyping(vcWorking.getAlleles()); + return stateTracker.toAFCalcResult(log10AlleleFrequencyPriors); + } + + // --------------------------------------------------------------------------- + // + // Abstract methods that should be implemented by concrete implementations + // to actually calculate the AF + // + // --------------------------------------------------------------------------- + + /** + * Look at VC and perhaps return a new one of reduced complexity, if that's necessary + * + * Used before the call to computeLog10PNonRef to simply the calculation job at hand, + * if vc exceeds bounds. For example, if VC has 100 alt alleles this function + * may decide to only genotype the best 2 of them. 
+ * + * @param vc the initial VC provided by the caller to this AFcalculation + * @return a potentially simpler VC that's more tractable to genotype + */ + @Requires("vc != null") + @Ensures("result != null") + protected abstract VariantContext reduceScope(final VariantContext vc); + + /** + * Actually carry out the log10PNonRef calculation on vc, storing results in results + * + * @param vc variant context with alleles and genotype likelihoods + * @param log10AlleleFrequencyPriors priors + * @return a AFCalcResult object describing the results of this calculation + */ + @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) + protected abstract AFCalcResult computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors); + + /** + * Subset VC to the just allelesToUse, updating genotype likelihoods + * + * Must be overridden by concrete subclasses + * + * @param vc variant context with alleles and genotype likelihoods + * @param allelesToUse alleles to subset + * @param assignGenotypes + * @param ploidy + * @return GenotypesContext object + */ + public abstract GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy); + + // --------------------------------------------------------------------------- + // + // accessors + // + // --------------------------------------------------------------------------- + + public int getMaxAltAlleles() { + return maxAlternateAllelesToGenotype; + } + + protected StateTracker getStateTracker() { + return stateTracker; + } + +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java new file mode 100644 index 000000000..efb16101e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcFactory.java @@ -0,0 +1,216 @@ 
+package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
+
+import org.apache.log4j.Logger;
+import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
+import org.broadinstitute.sting.utils.Utils;
+import org.broadinstitute.sting.utils.classloader.PluginManager;
+import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+
+import java.lang.reflect.Constructor;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Factory to make AFCalculations
+ *
+ * All members are static; the private constructor prevents instantiation.
+ */
+public class AFCalcFactory {
+    /**
+     * Logger used by the factory itself when the caller does not supply one
+     */
+    private static final Logger defaultLogger = Logger.getLogger(AFCalcFactory.class);
+
+    /**
+     * Enumeration of usable AF calculation, their constraints (i.e. ploidy).
+     *
+     * Note that the order these occur in the enum is the order of preference, so
+     * the first value is taken over the second when multiple calculations satisfy
+     * the needs of the request (i.e., considering ploidy).
+     */
+    public enum Calculation {
+        /** expt. implementation -- for testing only */
+        EXACT_INDEPENDENT(IndependentAllelesDiploidExactAFCalc.class, 2, -1),
+
+        /** reference implementation of multi-allelic EXACT model.  Extremely slow for many alternate alleles */
+        EXACT_REFERENCE(ReferenceDiploidExactAFCalc.class, 2, -1),
+
+        /** original biallelic exact model, for testing only */
+        EXACT_ORIGINAL(OriginalDiploidExactAFCalc.class, 2, 2),
+
+        /** implementation that supports any sample ploidy */
+        EXACT_GENERAL_PLOIDY("GeneralPloidyExactAFCalc", -1, -1);
+
+        /**
+         * Must be a name because we look this up dynamically
+         */
+        public final String className;
+
+        /** Largest number of alt alleles this calculation accepts, or -1 for no limit */
+        public final int maxAltAlleles;
+
+        /** The only ploidy this calculation accepts, or -1 for any ploidy */
+        public final int requiredPloidy;
+
+        private Calculation(final String className, final int requiredPloidy, final int maxAltAlleles) {
+            this.className = className;
+            this.requiredPloidy = requiredPloidy;
+            this.maxAltAlleles = maxAltAlleles;
+        }
+
+        private Calculation(final Class<?> clazz, final int requiredPloidy, final int maxAltAlleles) {
+            this(clazz.getSimpleName(), requiredPloidy, maxAltAlleles);
+        }
+
+        /**
+         * Can this calculation be used for the requested ploidy and number of alt alleles?
+         *
+         * @param requestedPloidy the sample ploidy the caller needs
+         * @param requestedMaxAltAlleles the max. number of alt alleles the caller needs
+         * @return true if both constraints (where not -1, i.e. unconstrained) are satisfied
+         */
+        public boolean usableForParams(final int requestedPloidy, final int requestedMaxAltAlleles) {
+            return (requiredPloidy == -1 || requiredPloidy == requestedPloidy)
+                    && (maxAltAlleles == -1 || maxAltAlleles >= requestedMaxAltAlleles);
+        }
+
+        public static Calculation getDefaultModel() { return EXACT_INDEPENDENT; }
+    }
+
+    private static final Map<String, Class<? extends AFCalc>> afClasses;
+    static {
+        afClasses = new PluginManager<AFCalc>(AFCalc.class).getPluginsByName();
+    }
+
+    private AFCalcFactory() {
+
+    }
+
+    /**
+     * Find the plugin class whose simple name contains name
+     *
+     * @param name the (partial) simple class name to look for
+     * @return the first matching class, or null if no plugin matches
+     */
+    private static Class<? extends AFCalc> getClassByName(final String name) {
+        for ( final Class<? extends AFCalc> clazz : afClasses.values() ) {
+            if ( clazz.getSimpleName().contains(name) ) {
+                return clazz;
+            }
+        }
+
+        return null;
+    }
+
+    /**
+     * Create a new AFCalc based on the parameters in the UAC
+     *
+     * If UAC.AFmodel cannot handle the requested ploidy / max alt alleles, the first
+     * Calculation (in enum order, i.e. order of preference) that can is selected and
+     * written back to UAC.AFmodel.
+     *
+     * @param UAC the UnifiedArgumentCollection containing the command-line parameters for the caller
+     * @param nSamples the number of samples we will be using
+     * @param logger an optional (can be null) logger to override the default in the model
+     * @return an initialized AFCalc
+     * @throws UserException if no calculation supports the requested ploidy and max alt alleles
+     */
+    public static AFCalc createAFCalc(final UnifiedArgumentCollection UAC,
+                                      final int nSamples,
+                                      final Logger logger) {
+        // logger is optional, so never dereference it directly for the factory's own messages
+        final Logger log = logger != null ? logger : defaultLogger;
+
+        final int maxAltAlleles = UAC.MAX_ALTERNATE_ALLELES;
+        if ( ! UAC.AFmodel.usableForParams(UAC.samplePloidy, maxAltAlleles) ) {
+            log.info("Requested ploidy " + UAC.samplePloidy + " maxAltAlleles " + maxAltAlleles + " not supported by requested model " + UAC.AFmodel + " looking for an option");
+            final List<Calculation> supportingCalculations = new LinkedList<Calculation>();
+            for ( final Calculation calc : Calculation.values() ) {
+                if ( calc.usableForParams(UAC.samplePloidy, maxAltAlleles) )
+                    supportingCalculations.add(calc);
+            }
+
+            if ( supportingCalculations.isEmpty() )
+                throw new UserException("no AFCalculation model found that supports ploidy of " + UAC.samplePloidy + " and max alt alleles " + maxAltAlleles);
+            if ( supportingCalculations.size() > 1 )
+                log.debug("Warning, multiple supporting AFCalcs found " + Utils.join(",", supportingCalculations) + " choosing first arbitrarily");
+
+            // Always switch to the first candidate: previously the model was only replaced when exactly
+            // one candidate existed, leaving the unusable model in place (and failing below) otherwise
+            UAC.AFmodel = supportingCalculations.get(0);
+            log.info("Selecting model " + UAC.AFmodel);
+        }
+
+        final AFCalc calc = createAFCalc(UAC.AFmodel, nSamples, UAC.MAX_ALTERNATE_ALLELES, UAC.samplePloidy);
+
+        if ( logger != null ) calc.setLogger(logger);
+        if ( UAC.exactCallsLog != null ) calc.enableProcessLog(UAC.exactCallsLog);
+
+        return calc;
+    }
+
+    /**
+     * Create a new AFCalc, choosing the best implementation based on the given parameters, assuming
+     * that we will only be requesting bi-allelic variants to diploid genotypes
+     *
+     * @param nSamples the number of samples we'll be using
+     *
+     * @return an initialized AFCalc
+     */
+    public static AFCalc createAFCalc(final int nSamples) {
+        return createAFCalc(chooseBestCalculation(nSamples, 2, 1), nSamples, 2, 2);
+    }
+
+    /**
+     * Create a new AFCalc that supports maxAltAlleles for all variants and diploid genotypes
+     *
+     * @param calc the calculation we'd like to use
+     * @param nSamples the number of samples we'll be using
+     * @param maxAltAlleles the max. alt alleles for both SNPs and indels
+     *
+     * @return an initialized AFCalc
+     */
+    public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles) {
+        return createAFCalc(calc, nSamples, maxAltAlleles, 2);
+    }
+
+    /**
+     * Create a new AFCalc, choosing the best implementation based on the given parameters
+     *
+     * @param nSamples the number of samples we'll be using
+     * @param maxAltAlleles the max. alt alleles to consider for SNPs
+     * @param ploidy the sample ploidy.  Must be consistent with the calc
+     *
+     * @return an initialized AFCalc
+     */
+    public static AFCalc createAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
+        return createAFCalc(chooseBestCalculation(nSamples, ploidy, maxAltAlleles), nSamples, maxAltAlleles, ploidy);
+    }
+
+    /**
+     * Choose the best calculation for nSamples and ploidy
+     *
+     * The first Calculation, in enum declaration order, that is usable for ploidy and
+     * maxAltAlleles is returned.  nSamples is only used in the error message.
+     *
+     * @param nSamples the number of samples we'll be using
+     * @param ploidy the sample ploidy
+     * @param maxAltAlleles the max. alt alleles to consider
+     * @return the first usable Calculation
+     * @throws IllegalStateException if no Calculation is usable for these parameters
+     */
+    private static Calculation chooseBestCalculation(final int nSamples, final int ploidy, final int maxAltAlleles) {
+        for ( final Calculation calc : Calculation.values() ) {
+            if ( calc.usableForParams(ploidy, maxAltAlleles) ) {
+                return calc;
+            }
+        }
+
+        throw new IllegalStateException("no calculation found that supports nSamples " + nSamples + " ploidy " + ploidy + " and maxAltAlleles " + maxAltAlleles);
+    }
+
+    /**
+     * Create a new AFCalc
+     *
+     * The implementation class is looked up by name among the AFCalc plugins and
+     * instantiated reflectively through its (int, int, int) constructor.
+     *
+     * @param calc the calculation to use
+     * @param nSamples the number of samples we'll be using
+     * @param maxAltAlleles the max. alt alleles to consider for SNPs
+     * @param ploidy the sample ploidy.  Must be consistent with the calc
+     *
+     * @return an initialized AFCalc
+     * @throws IllegalArgumentException if an argument is out of range, calc cannot handle the
+     *                                  requested parameters, or no plugin class matches calc
+     * @throws ReviewedStingException if the implementation class cannot be instantiated
+     */
+    public static AFCalc createAFCalc(final Calculation calc, final int nSamples, final int maxAltAlleles, final int ploidy) {
+        if ( calc == null ) throw new IllegalArgumentException("Calculation cannot be null");
+        if ( nSamples < 0 ) throw new IllegalArgumentException("nSamples must be greater than zero " + nSamples);
+        if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be greater than zero " + maxAltAlleles);
+        if ( ploidy < 1 ) throw new IllegalArgumentException("sample ploidy must be greater than zero " + ploidy);
+
+        if ( ! calc.usableForParams(ploidy, maxAltAlleles) )
+            throw new IllegalArgumentException("AFCalc " + calc + " does not support requested ploidy " + ploidy);
+
+        final Class<? extends AFCalc> afClass = getClassByName(calc.className);
+        if ( afClass == null )
+            throw new IllegalArgumentException("Unexpected AFCalc " + calc);
+
+        try {
+            Object args[] = new Object[]{nSamples, maxAltAlleles, ploidy};
+            Constructor<?> c = afClass.getDeclaredConstructor(int.class, int.class, int.class);
+            return (AFCalc)c.newInstance(args);
+        } catch (Exception e) {
+            throw new ReviewedStingException("Could not instantiate AFCalc " + calc, e);
+        }
+    }
+
+    /**
+     * Create one AFCalc per requested calculation, all with the same parameters
+     *
+     * @param calcs the calculations to instantiate
+     * @param nSamples the number of samples we'll be using
+     * @param maxAltAlleles the max. alt alleles to consider
+     * @param ploidy the sample ploidy
+     * @return the AFCalcs, in the same order as calcs
+     */
+    protected static List<AFCalc> createAFCalcs(final List<Calculation> calcs, final int nSamples, final int maxAltAlleles, final int ploidy) {
+        final List<AFCalc> AFCalcs = new LinkedList<AFCalc>();
+
+        for ( final Calculation calc : calcs )
+            AFCalcs.add(createAFCalc(calc, nSamples, maxAltAlleles, ploidy));
+
+        return AFCalcs;
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java
new file mode 100644
index 000000000..a65772444
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2010.
+ * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.*; + +/** + * Describes the results of the AFCalc + * + * Only the bare essentials are represented here, as all AFCalc models must return meaningful results for + * all of these fields. + * + * Note that all of the values -- i.e. priors -- are checked now that they are meaningful, which means + * that users of this code can rely on the values coming out of these functions. 
+ */ +public class AFCalcResult { + private final static int AF0 = 0; + private final static int AF1p = 1; + private final static int LOG_10_ARRAY_SIZES = 2; + + private final double[] log10LikelihoodsOfAC; + private final double[] log10PriorsOfAC; + private final double[] log10PosteriorsOfAC; + + private final Map log10pNonRefByAllele; + + /** + * The AC values for all ALT alleles at the MLE + */ + private final int[] alleleCountsOfMLE; + + int nEvaluations = 0; + + /** + * The list of alleles actually used in computing the AF + */ + private List allelesUsedInGenotyping = null; + + /** + * Create a results object capability of storing results for calls with up to maxAltAlleles + */ + public AFCalcResult(final int[] alleleCountsOfMLE, + final int nEvaluations, + final List allelesUsedInGenotyping, + final double[] log10LikelihoodsOfAC, + final double[] log10PriorsOfAC, + final Map log10pNonRefByAllele) { + if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.size() < 1 ) throw new IllegalArgumentException("allelesUsedInGenotyping must be non-null list of at least 1 value " + allelesUsedInGenotyping); + if ( alleleCountsOfMLE == null ) throw new IllegalArgumentException("alleleCountsOfMLE cannot be null"); + if ( alleleCountsOfMLE.length != allelesUsedInGenotyping.size() - 1) throw new IllegalArgumentException("alleleCountsOfMLE.length " + alleleCountsOfMLE.length + " != allelesUsedInGenotyping.size() " + allelesUsedInGenotyping.size()); + if ( nEvaluations < 0 ) throw new IllegalArgumentException("nEvaluations must be >= 0 but saw " + nEvaluations); + if ( log10LikelihoodsOfAC.length != 2 ) throw new IllegalArgumentException("log10LikelihoodsOfAC must have length equal 2"); + if ( log10PriorsOfAC.length != 2 ) throw new IllegalArgumentException("log10PriorsOfAC must have length equal 2"); + if ( log10pNonRefByAllele == null ) throw new IllegalArgumentException("log10pNonRefByAllele cannot be null"); + if ( log10pNonRefByAllele.size() != 
allelesUsedInGenotyping.size() - 1 ) throw new IllegalArgumentException("log10pNonRefByAllele has the wrong number of elements: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( ! allelesUsedInGenotyping.containsAll(log10pNonRefByAllele.keySet()) ) throw new IllegalArgumentException("log10pNonRefByAllele doesn't contain all of the alleles used in genotyping: log10pNonRefByAllele " + log10pNonRefByAllele + " but allelesUsedInGenotyping " + allelesUsedInGenotyping); + if ( ! MathUtils.goodLog10ProbVector(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES, false) ) throw new IllegalArgumentException("log10LikelihoodsOfAC are bad " + Utils.join(",", log10LikelihoodsOfAC)); + if ( ! MathUtils.goodLog10ProbVector(log10PriorsOfAC, LOG_10_ARRAY_SIZES, true) ) throw new IllegalArgumentException("log10priors are bad " + Utils.join(",", log10PriorsOfAC)); + + this.alleleCountsOfMLE = alleleCountsOfMLE; + this.nEvaluations = nEvaluations; + this.allelesUsedInGenotyping = allelesUsedInGenotyping; + + this.log10LikelihoodsOfAC = Arrays.copyOf(log10LikelihoodsOfAC, LOG_10_ARRAY_SIZES); + this.log10PriorsOfAC = Arrays.copyOf(log10PriorsOfAC, LOG_10_ARRAY_SIZES); + this.log10PosteriorsOfAC = computePosteriors(log10LikelihoodsOfAC, log10PriorsOfAC); + this.log10pNonRefByAllele = new HashMap(log10pNonRefByAllele); + } + + /** + * Return a new AFCalcResult with a new prior probability + * + * @param log10PriorsOfAC + * @return + */ + public AFCalcResult withNewPriors(final double[] log10PriorsOfAC) { + return new AFCalcResult(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + } + + /** + * Returns a vector with maxAltAlleles values containing AC values at the MLE + * + * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order, + * starting from index 0 (i.e., the first alt allele is at 0). 
The vector is always + * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values + * are meaningful. + * + * @return a vector with allele counts, not all of which may be meaningful + */ + @Ensures("result != null") + public int[] getAlleleCountsOfMLE() { + return alleleCountsOfMLE; + } + + /** + * Returns the AC of allele a la #getAlleleCountsOfMLE + * + * @param allele the allele whose AC we want to know. Error if its not in allelesUsedInGenotyping + * @throws IllegalStateException if allele isn't in allelesUsedInGenotyping + * @return the AC of allele + */ + public int getAlleleCountAtMLE(final Allele allele) { + return getAlleleCountsOfMLE()[altAlleleIndex(allele)]; + } + + /** + * Returns the number of cycles used to evaluate the pNonRef for this AF calculation + * + * @return the number of evaluations required to produce the answer for this AF calculation + */ + public int getnEvaluations() { + return nEvaluations; + } + + /** + * Get the list of alleles actually used in genotyping. + * + * Due to computational / implementation constraints this may be smaller than + * the actual list of alleles requested + * + * @return a non-empty list of alleles used during genotyping, the first of which is the reference allele + */ + @Ensures({"result != null", "! 
result.isEmpty()"}) + public List getAllelesUsedInGenotyping() { + return allelesUsedInGenotyping; + } + + /** + * Get the log10 normalized -- across all ACs -- posterior probability of AC == 0 for all alleles + * + * @return + */ + @Ensures({"MathUtils.goodLog10Probability(result)"}) + public double getLog10PosteriorOfAFEq0() { + return log10PosteriorsOfAC[AF0]; + } + + /** + * Get the log10 normalized -- across all ACs -- posterior probability of AC > 0 for any alleles + * + * @return + */ + @Ensures({"MathUtils.goodLog10Probability(result)"}) + public double getLog10PosteriorOfAFGT0() { + return log10PosteriorsOfAC[AF1p]; + } + + /** + * Get the log10 unnormalized -- across all ACs -- likelihood of AC == 0 for all alleles + * + * @return + */ + @Ensures({"MathUtils.goodLog10Probability(result)"}) + public double getLog10LikelihoodOfAFEq0() { + return log10LikelihoodsOfAC[AF0]; + } + + /** + * Get the log10 unnormalized -- across all ACs -- likelihood of AC > 0 for any alleles + * + * @return + */ + @Ensures({"MathUtils.goodLog10Probability(result)"}) + public double getLog10LikelihoodOfAFGT0() { + return log10LikelihoodsOfAC[AF1p]; + } + + /** + * Get the log10 unnormalized -- across all ACs -- prior probability of AC == 0 for all alleles + * + * @return + */ + @Ensures({"MathUtils.goodLog10Probability(result)"}) + public double getLog10PriorOfAFEq0() { + return log10PriorsOfAC[AF0]; + } + + /** + * Get the log10 unnormalized -- across all ACs -- prior probability of AC > 0 + * + * @return + */ + @Ensures({"MathUtils.goodLog10Probability(result)"}) + public double getLog10PriorOfAFGT0() { + return log10PriorsOfAC[AF1p]; + } + + @Override + public String toString() { + final List byAllele = new LinkedList(); + for ( final Allele a : getAllelesUsedInGenotyping() ) + if ( a.isNonReference() ) byAllele.add(String.format("%s => MLE %d / posterior %.2f", a, getAlleleCountAtMLE(a), getLog10PosteriorOfAFGt0ForAllele(a))); + return 
String.format("AFCalc%n\t\tlog10PosteriorOfAFGT0=%.2f%n\t\t%s", getLog10PosteriorOfAFGT0(), Utils.join("\n\t\t", byAllele)); // print the posterior so the value matches its label
+    }
+
+    /**
+     * Are we sufficiently confident in being non-ref that the allele is considered polymorphic?
+     *
+     * The allele is polymorphic if its log10 posterior probability of being non-ref is >= the
+     * emit confidence (often an argument).
+     * Suppose the posterior of AF > 0 for allele is log10: -2 => 10^-2
+     * And that log10minPNonRef is -3.
+     * We are considered polymorphic since 10^-2 >= 10^-3 => -2 >= -3
+     *
+     * @param allele the allele we're interested in, looked up via getLog10PosteriorOfAFGt0ForAllele
+     * @param log10minPNonRef the log10 scaled min pr of being non-ref to be considered polymorphic
+     *
+     * @return true if there's enough confidence (relative to log10minPNonRef) to reject AF == 0
+     */
+    public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) {
+        return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef;
+    }
+
+    /**
+     * Are any of the alleles polymorphic w.r.t. #isPolymorphic?
+     *
+     * Only non-reference alleles from getAllelesUsedInGenotyping are tested.
+     *
+     * @param log10minPNonRef the confidence threshold, in log10 space
+     * @return true if any are poly, false otherwise
+     */
+    public boolean anyPolymorphic(final double log10minPNonRef) {
+        for ( final Allele a : getAllelesUsedInGenotyping() )
+            if ( a.isNonReference() && isPolymorphic(a, log10minPNonRef) )
+                return true;
+        return false;
+    }
+
+    /**
+     * Returns the log10 probability that allele is segregating
+     *
+     * Unlike the sites-level annotation, this calculation is specific to allele, and can be
+     * used to separately determine how much evidence there is that allele is independently
+     * segregating as opposed to the site being polymorphic with any allele.
In the bi-allelic + * case these are obviously the same but for multiple alt alleles there can be lots of + * evidence for one allele but not so much for any other allele + * + * @param allele the allele we're interested in, must be in getAllelesUsedInGenotyping + * @return the log10 probability that allele is segregating at this site + */ + @Ensures("MathUtils.goodLog10Probability(result)") + public double getLog10PosteriorOfAFGt0ForAllele(final Allele allele) { + final Double log10pNonRef = log10pNonRefByAllele.get(allele); + if ( log10pNonRef == null ) throw new IllegalArgumentException("Unknown allele " + allele); + return log10pNonRef; + } + + /** + * Returns the log10 normalized posteriors given the log10 likelihoods and priors + * + * @param log10LikelihoodsOfAC + * @param log10PriorsOfAC + * + * @return freshly allocated log10 normalized posteriors vector + */ + @Requires("log10LikelihoodsOfAC.length == log10PriorsOfAC.length") + @Ensures("MathUtils.goodLog10ProbVector(result, LOG_10_ARRAY_SIZES, true)") + private static double[] computePosteriors(final double[] log10LikelihoodsOfAC, final double[] log10PriorsOfAC) { + final double[] log10UnnormalizedPosteriors = new double[log10LikelihoodsOfAC.length]; + for ( int i = 0; i < log10LikelihoodsOfAC.length; i++ ) + log10UnnormalizedPosteriors[i] = log10LikelihoodsOfAC[i] + log10PriorsOfAC[i]; + return MathUtils.normalizeFromLog10(log10UnnormalizedPosteriors, true, false); + } + + /** + * Computes the offset into linear vectors indexed by alt allele for allele + * + * Things like our MLE allele count vector are indexed by alt allele index, with + * the first alt allele being 0, the second 1, etc. This function computes the index + * associated with allele. 
+     *
+     * @param allele the allele whose alt index we'd like to know
+     * @throws IllegalArgumentException if allele is the reference allele or isn't in allelesUsedInGenotyping
+     * @return an index value greater than or equal to 0 suitable for indexing into the MLE and other alt allele indexed arrays
+     */
+    @Requires("allele != null")
+    @Ensures({"result >= 0", "result < allelesUsedInGenotyping.size() - 1"})
+    private int altAlleleIndex(final Allele allele) {
+        if ( allele.isReference() ) throw new IllegalArgumentException("Cannot get the alt allele index for reference allele " + allele);
+        final int index = allelesUsedInGenotyping.indexOf(allele);
+        if ( index == -1 )
+            throw new IllegalArgumentException("could not find allele " + allele + " in " + allelesUsedInGenotyping);
+        else
+            // alt-indexed arrays skip the reference allele, which is documented as the first entry of the list
+            return index - 1;
+    }
+}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java
new file mode 100755
index 000000000..4895c84d9
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/DiploidExactAFCalc.java
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2010.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +public abstract class DiploidExactAFCalc extends ExactAFCalc { + public DiploidExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + if ( ploidy != 2 ) throw new IllegalArgumentException("ploidy must be two for DiploidExactAFCalc and subclasses but saw " + ploidy); + } + + @Override + protected AFCalcResult computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final int numAlternateAlleles = vc.getNAlleles() - 1; + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), true); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + // queue of AC conformations to process + final LinkedList ACqueue = new LinkedList(); + + // mapping of ExactACset indexes to the objects + final HashMap indexesToACset = new HashMap(numChr+1); + + // add AC=0 to the queue + final int[] zeroCounts = new int[numAlternateAlleles]; + ExactACset zeroSet = new ExactACset(numSamples+1, new ExactACcounts(zeroCounts)); + ACqueue.add(zeroSet); + indexesToACset.put(zeroSet.getACcounts(), zeroSet); + + while ( !ACqueue.isEmpty() ) { + getStateTracker().incNEvaluations(); // keep track of the number of evaluations + + // compute log10Likelihoods + final 
ExactACset set = ACqueue.remove(); + + final double log10LofKs = calculateAlleleCountConformation(set, genotypeLikelihoods, numChr, ACqueue, indexesToACset, log10AlleleFrequencyPriors); + + // clean up memory + indexesToACset.remove(set.getACcounts()); + //if ( DEBUG ) + // System.out.printf(" *** removing used set=%s%n", set.ACcounts); + } + + return getResultFromFinalState(vc, log10AlleleFrequencyPriors); + } + + @Override + protected VariantContext reduceScope(final VariantContext vc) { + // don't try to genotype too many alternate alleles + if ( vc.getAlternateAlleles().size() > getMaxAltAlleles() ) { + logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); + + VariantContextBuilder builder = new VariantContextBuilder(vc); + List alleles = new ArrayList(getMaxAltAlleles() + 1); + alleles.add(vc.getReference()); + alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles())); + builder.alleles(alleles); + builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + return builder.make(); + } else { + return vc; + } + } + + private static final int PL_INDEX_OF_HOM_REF = 0; + private static List chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose) { + final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); + final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; + for ( int i = 0; i < numOriginalAltAlleles; i++ ) + likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); + + // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype + final ArrayList GLs = getGLs(vc.getGenotypes(), true); + for ( final double[] likelihoods : GLs ) { + 
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); + if ( PLindexOfBestGL != PL_INDEX_OF_HOM_REF ) { + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindexOfBestGL); + if ( alleles.alleleIndex1 != 0 ) + likelihoodSums[alleles.alleleIndex1-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + // don't double-count it + if ( alleles.alleleIndex2 != 0 && alleles.alleleIndex2 != alleles.alleleIndex1 ) + likelihoodSums[alleles.alleleIndex2-1].sum += likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]; + } + } + + // sort them by probability mass and choose the best ones + Collections.sort(Arrays.asList(likelihoodSums)); + final ArrayList bestAlleles = new ArrayList(numAllelesToChoose); + for ( int i = 0; i < numAllelesToChoose; i++ ) + bestAlleles.add(likelihoodSums[i].allele); + + final ArrayList orderedBestAlleles = new ArrayList(numAllelesToChoose); + for ( Allele allele : vc.getAlternateAlleles() ) { + if ( bestAlleles.contains(allele) ) + orderedBestAlleles.add(allele); + } + + return orderedBestAlleles; + } + + private static final class DependentSet { + public final int[] ACcounts; + public final int PLindex; + + public DependentSet(final int[] ACcounts, final int PLindex) { + this.ACcounts = ACcounts; + this.PLindex = PLindex; + } + } + + private double calculateAlleleCountConformation(final ExactACset set, + final ArrayList genotypeLikelihoods, + final int numChr, + final LinkedList ACqueue, + final HashMap indexesToACset, + final double[] log10AlleleFrequencyPriors) { + + //if ( DEBUG ) + // System.out.printf(" *** computing LofK for set=%s%n", set.ACcounts); + + // compute the log10Likelihoods + computeLofK(set, genotypeLikelihoods, log10AlleleFrequencyPriors); + + final double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + + // can we abort early because the log10Likelihoods are so small? 
+ if ( getStateTracker().abort(log10LofK, set.getACcounts(), true) ) { + //if ( DEBUG ) + // System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); + return log10LofK; + } + + // iterate over higher frequencies if possible + final int ACwiggle = numChr - set.getACsum(); + if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies + return log10LofK; + + final int numAltAlleles = set.getACcounts().getCounts().length; + + // add conformations for the k+1 case + for ( int allele = 0; allele < numAltAlleles; allele++ ) { + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); + ACcountsClone[allele]++; + // to get to this conformation, a sample would need to be AB (remember that ref=0) + final int PLindex = GenotypeLikelihoods.calculatePLindex(0, allele+1); + updateACset(ACcountsClone, numChr, set, PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + // add conformations for the k+2 case if it makes sense; note that the 2 new alleles may be the same or different + if ( ACwiggle > 1 ) { + final ArrayList differentAlleles = new ArrayList(numAltAlleles * numAltAlleles); + final ArrayList sameAlleles = new ArrayList(numAltAlleles); + + for ( int allele_i = 0; allele_i < numAltAlleles; allele_i++ ) { + for ( int allele_j = allele_i; allele_j < numAltAlleles; allele_j++ ) { + final int[] ACcountsClone = set.getACcounts().getCounts().clone(); + ACcountsClone[allele_i]++; + ACcountsClone[allele_j]++; + + // to get to this conformation, a sample would need to be BB or BC (remember that ref=0, so add one to the index) + final int PLindex = GenotypeLikelihoods.calculatePLindex(allele_i+1, allele_j+1); + if ( allele_i == allele_j ) + sameAlleles.add(new DependentSet(ACcountsClone, PLindex)); + else + differentAlleles.add(new DependentSet(ACcountsClone, PLindex)); + } + } + + // IMPORTANT: we must first add the cases where the 2 new alleles are 
different so that the queue maintains its ordering + for ( DependentSet dependent : differentAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + for ( DependentSet dependent : sameAlleles ) + updateACset(dependent.ACcounts, numChr, set, dependent.PLindex, ACqueue, indexesToACset, genotypeLikelihoods); + } + + return log10LofK; + } + + // adds the ExactACset represented by the ACcounts to the ACqueue if not already there (creating it if needed) and + // also pushes its value to the given callingSetIndex. + private void updateACset(final int[] newSetCounts, + final int numChr, + final ExactACset dependentSet, + final int PLsetIndex, + final Queue ACqueue, + final HashMap indexesToACset, + final ArrayList genotypeLikelihoods) { + final ExactACcounts index = new ExactACcounts(newSetCounts); + if ( !indexesToACset.containsKey(index) ) { + ExactACset set = new ExactACset(numChr/2 +1, index); + indexesToACset.put(index, set); + ACqueue.add(set); + } + + // push data from the dependency to the new set + //if ( DEBUG ) + // System.out.println(" *** pushing data from " + index + " to " + dependencySet.ACcounts); + pushData(indexesToACset.get(index), dependentSet, PLsetIndex, genotypeLikelihoods); + } + + private void computeLofK(final ExactACset set, + final ArrayList genotypeLikelihoods, + final double[] log10AlleleFrequencyPriors) { + + set.getLog10Likelihoods()[0] = 0.0; // the zero case + final int totalK = set.getACsum(); + + // special case for k = 0 over all k + if ( totalK == 0 ) { + for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) + set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j-1] + genotypeLikelihoods.get(j)[HOM_REF_INDEX]; + + final double log10Lof0 = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0); + getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); + 
return; + } + + // if we got here, then k > 0 for at least one k. + // the non-AA possible conformations were already dealt with by pushes from dependent sets; + // now deal with the AA case (which depends on previous cells in this column) and then update the L(j,k) value + for ( int j = 1; j < set.getLog10Likelihoods().length; j++ ) { + + if ( totalK < 2*j-1 ) { + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = MathUtils.log10Cache[2*j-totalK] + MathUtils.log10Cache[2*j-totalK-1] + set.getLog10Likelihoods()[j-1] + gl[HOM_REF_INDEX]; + set.getLog10Likelihoods()[j] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[j], conformationValue); + } + + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + set.getLog10Likelihoods()[j] = set.getLog10Likelihoods()[j] - logDenominator; + } + + double log10LofK = set.getLog10Likelihoods()[set.getLog10Likelihoods().length-1]; + + // update the MLE if necessary + getStateTracker().updateMLEifNeeded(log10LofK, set.getACcounts().getCounts()); + + // apply the priors over each alternate allele + for ( final int ACcount : set.getACcounts().getCounts() ) { + if ( ACcount > 0 ) + log10LofK += log10AlleleFrequencyPriors[ACcount]; + } + + getStateTracker().updateMAPifNeeded(log10LofK, set.getACcounts().getCounts()); + } + + private void pushData(final ExactACset targetSet, + final ExactACset dependentSet, + final int PLsetIndex, + final ArrayList genotypeLikelihoods) { + final int totalK = targetSet.getACsum(); + + for ( int j = 1; j < targetSet.getLog10Likelihoods().length; j++ ) { + + if ( totalK <= 2*j ) { // skip impossible conformations + final double[] gl = genotypeLikelihoods.get(j); + final double conformationValue = + determineCoefficient(PLsetIndex, j, targetSet.getACcounts().getCounts(), totalK) + dependentSet.getLog10Likelihoods()[j-1] + gl[PLsetIndex]; + targetSet.getLog10Likelihoods()[j] = 
MathUtils.approximateLog10SumLog10(targetSet.getLog10Likelihoods()[j], conformationValue); + } + } + } + + private double determineCoefficient(int PLindex, final int j, final int[] ACcounts, final int totalK) { + // the closed form representation generalized for multiple alleles is as follows: + // AA: (2j - totalK) * (2j - totalK - 1) + // AB: 2k_b * (2j - totalK) + // AC: 2k_c * (2j - totalK) + // BB: k_b * (k_b - 1) + // BC: 2 * k_b * k_c + // CC: k_c * (k_c - 1) + + // find the 2 alleles that are represented by this PL index + GenotypeLikelihoods.GenotypeLikelihoodsAllelePair alleles = GenotypeLikelihoods.getAllelePair(PLindex); + + // *** note that throughout this method we subtract one from the alleleIndex because ACcounts *** + // *** doesn't consider the reference allele whereas the GenotypeLikelihoods PL cache does. *** + + // the AX het case + if ( alleles.alleleIndex1 == 0 ) + return MathUtils.log10Cache[2*ACcounts[alleles.alleleIndex2-1]] + MathUtils.log10Cache[2*j-totalK]; + + final int k_i = ACcounts[alleles.alleleIndex1-1]; + + // the hom var case (e.g. BB, CC, DD) + final double coeff; + if ( alleles.alleleIndex1 == alleles.alleleIndex2 ) { + coeff = MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_i - 1]; + } + // the het non-ref case (e.g. 
BC, BD, CD) + else { + final int k_j = ACcounts[alleles.alleleIndex2-1]; + coeff = MathUtils.log10Cache[2] + MathUtils.log10Cache[k_i] + MathUtils.log10Cache[k_j]; + } + + return coeff; + } + + public GenotypesContext subsetAlleles(final VariantContext vc, + final List allelesToUse, + final boolean assignGenotypes, + final int ploidy) { + return VariantContextUtils.subsetDiploidAlleles(vc, allelesToUse, assignGenotypes); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java new file mode 100644 index 000000000..af6d46eb8 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACcounts.java @@ -0,0 +1,46 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import java.util.Arrays; + +/** +* Created with IntelliJ IDEA. +* User: depristo +* Date: 10/5/12 +* Time: 2:54 PM +* To change this template use File | Settings | File Templates. 
+*/ // a wrapper around the int array so that we can make it hashable
+public final class ExactACcounts {
+    // the wrapped array is stored by reference (no copy); the cached hash assumes it is not modified afterwards
+    private final int[] counts;
+    // lazily computed hash of counts; -1 means "not computed yet"
+    private int hashcode = -1;
+
+    /**
+     * Wrap counts so that it can be used as a hash key
+     *
+     * @param counts the allele counts to wrap; kept by reference
+     */
+    public ExactACcounts(final int[] counts) {
+        this.counts = counts;
+    }
+
+    /**
+     * @return the wrapped counts array itself, not a copy
+     */
+    public int[] getCounts() {
+        return counts;
+    }
+
+    /**
+     * Two ExactACcounts are equal when their wrapped arrays have equal contents
+     */
+    @Override
+    public boolean equals(Object obj) {
+        return (obj instanceof ExactACcounts) && Arrays.equals(getCounts(), ((ExactACcounts) obj).getCounts());
+    }
+
+    /**
+     * Content-based hash, consistent with equals, computed on first use and then cached
+     */
+    @Override
+    public int hashCode() {
+        if ( hashcode == -1 )
+            hashcode = Arrays.hashCode(getCounts());
+        return hashcode;
+    }
+
+    /**
+     * @return the counts joined by "/" (e.g. 1/0/2), or the empty string when there are no counts
+     */
+    @Override
+    public String toString() {
+        final int[] values = getCounts();
+        final StringBuilder sb = new StringBuilder();
+        // start at 0 and only emit the separator between elements so an empty array is handled too
+        for ( int i = 0; i < values.length; i++ ) {
+            if ( i > 0 )
+                sb.append("/");
+            sb.append(values[i]);
+        }
+        return sb.toString();
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java
new file mode 100644
index 000000000..de5bad57f
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactACset.java
@@ -0,0 +1,49 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
+
+import org.broadinstitute.sting.utils.MathUtils;
+
+import java.util.Arrays;
+
+/**
+* Created with IntelliJ IDEA.
+* User: depristo
+* Date: 10/5/12
+* Time: 2:53 PM
+* To change this template use File | Settings | File Templates.
+*/ // This class represents a column in the Exact AC calculation matrix
+public final class ExactACset {
+    // the counts of the various alternate alleles which this column represents
+    private final ExactACcounts ACcounts;
+
+    // the column of the matrix
+    private final double[] log10Likelihoods;
+
+    // lazily computed sum of the alt allele counts; -1 means "not computed yet"
+    int sum = -1;
+
+    /**
+     * Create a column with size entries, all initialized to negative infinity
+     *
+     * @param size     the number of entries in the likelihoods column
+     * @param ACcounts the alt allele counts this column represents
+     */
+    public ExactACset(final int size, final ExactACcounts ACcounts) {
+        this.ACcounts = ACcounts;
+        log10Likelihoods = new double[size];
+        Arrays.fill(log10Likelihoods, Double.NEGATIVE_INFINITY);
+    }
+
+    /**
+     * sum of all the non-reference alleles
+     */
+    public int getACsum() {
+        if ( sum == -1 )
+            sum = (int)MathUtils.sum(getACcounts().getCounts());
+        return sum;
+    }
+
+    /**
+     * Two sets are equal when they represent the same allele counts; the likelihoods are not compared
+     */
+    @Override
+    public boolean equals(Object obj) {
+        return (obj instanceof ExactACset) && getACcounts().equals(((ExactACset)obj).getACcounts());
+    }
+
+    /**
+     * Hash on the allele counts only, so that equal sets always have equal hash codes
+     */
+    @Override
+    public int hashCode() {
+        return getACcounts().hashCode();
+    }
+
+    public ExactACcounts getACcounts() {
+        return ACcounts;
+    }
+
+    /**
+     * @return the likelihoods column itself, not a copy
+     */
+    public double[] getLog10Likelihoods() {
+        return log10Likelihoods;
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java
new file mode 100755
index 000000000..ab230d398
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactAFCalc.java
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2010.
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.Genotype; +import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; +import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; + +import java.util.ArrayList; + +/** + * Uses the Exact calculation of Heng Li + */ +abstract class ExactAFCalc extends AFCalc { + protected static final int HOM_REF_INDEX = 0; // AA likelihoods are always first + + protected ExactAFCalc(final int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + } + + /** + * Wrapper class that compares two likelihoods associated with two alleles + */ + protected static final class LikelihoodSum implements Comparable { + public double sum = 0.0; + public Allele allele; + + public LikelihoodSum(Allele allele) { this.allele = allele; } + + public int compareTo(LikelihoodSum other) { + final double diff = sum - other.sum; + return ( diff < 0.0 ) ? 1 : (diff > 0.0 ) ? 
-1 : 0;
+        }
+    }
+
+    /**
+     * Unpack GenotypesContext into arraylist of double values
+     *
+     * Samples are visited in sample name order. A sample is skipped when it has no likelihoods
+     * or when its likelihoods sum to VariantContextUtils.SUM_GL_THRESH_NOCALL or more.
+     *
+     * @param GLs Input genotype context
+     * @param includeDummy if true, a {0.0,0.0,0.0} placeholder vector is added as the first element
+     * @return ArrayList of doubles corresponding to GL vectors
+     */
+    protected static ArrayList getGLs(final GenotypesContext GLs, final boolean includeDummy) {
+        ArrayList genotypeLikelihoods = new ArrayList(GLs.size() + 1);
+
+        if ( includeDummy ) genotypeLikelihoods.add(new double[]{0.0,0.0,0.0}); // dummy
+        for ( Genotype sample : GLs.iterateInSampleNameOrder() ) {
+            if ( sample.hasLikelihoods() ) {
+                double[] gls = sample.getLikelihoods().getAsVector();
+
+                if ( MathUtils.sum(gls) < VariantContextUtils.SUM_GL_THRESH_NOCALL )
+                    genotypeLikelihoods.add(gls);
+            }
+        }
+
+        return genotypeLikelihoods;
+    }
+}
\ No newline at end of file
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java
new file mode 100644
index 000000000..f13fe4429
--- /dev/null
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ExactCallLogger.java
@@ -0,0 +1,179 @@
+package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
+
+import com.google.java.contract.Requires;
+import org.apache.commons.lang.ArrayUtils;
+import org.broadinstitute.sting.utils.*;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.variantcontext.*;
+
+import java.io.*;
+import java.util.*;
+
+/**
+ * Allows us to write out and read in information about exact calls (site, alleles, PLs, etc) in tabular format
+ *
+ * Once opened, calls can be written to disk with printCallInfo
+ */
+public class ExactCallLogger implements Cloneable {
+    private PrintStream callReport = null;
+
+    /**
+     * Create a new ExactCallLogger writing its output to outputFile
+     *
+     * @param outputFile the file the tab-separated call report is written to
+     */
+    public ExactCallLogger(final File outputFile) {
+        try {
+            callReport = new PrintStream(new
BufferedOutputStream(new FileOutputStream(outputFile), 10000000)); + callReport.println(Utils.join("\t", Arrays.asList("loc", "variable", "key", "value"))); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(outputFile, e); + } + } + + /** + * Summarizes information about an exact call that happened + */ + public static class ExactCall { + final VariantContext vc; + final long runtime; + final AFCalcResult originalCall; + + public ExactCall(VariantContext vc, final long runtime, final AFCalcResult originalCall) { + this.vc = vc; + this.runtime = runtime; + this.originalCall = originalCall; + } + + @Override + public String toString() { + return String.format("ExactCall %s:%d alleles=%s nSamples=%s orig.pNonRef=%.2f orig.runtime=%s", + vc.getChr(), vc.getStart(), vc.getAlleles(), vc.getNSamples(), + originalCall.getLog10PosteriorOfAFGT0(), + new AutoFormattingTime(runtime / 1e9).toString()); + } + } + + protected final void printCallInfo(final VariantContext vc, + final double[] log10AlleleFrequencyPriors, + final long runtimeNano, + final AFCalcResult result) { + printCallElement(vc, "type", "ignore", vc.getType()); + + int allelei = 0; + for (final Allele a : vc.getAlleles()) + printCallElement(vc, "allele", allelei++, a.getDisplayString()); + + for (final Genotype g : vc.getGenotypes()) + printCallElement(vc, "PL", g.getSampleName(), g.getLikelihoodsString()); + + for (int priorI = 0; priorI < log10AlleleFrequencyPriors.length; priorI++) + printCallElement(vc, "priorI", priorI, log10AlleleFrequencyPriors[priorI]); + + printCallElement(vc, "runtime.nano", "ignore", runtimeNano); + printCallElement(vc, "log10PosteriorOfAFEq0", "ignore", result.getLog10PosteriorOfAFEq0()); + printCallElement(vc, "log10PosteriorOfAFGt0", "ignore", result.getLog10PosteriorOfAFGT0()); + + for ( final Allele allele : result.getAllelesUsedInGenotyping() ) { + if ( allele.isNonReference() ) { + printCallElement(vc, "MLE", allele, 
result.getAlleleCountAtMLE(allele)); + printCallElement(vc, "pNonRefByAllele", allele, result.getLog10PosteriorOfAFGt0ForAllele(allele)); + } + } + + callReport.flush(); + } + + @Requires({"vc != null", "variable != null", "key != null", "value != null", "callReport != null"}) + private void printCallElement(final VariantContext vc, + final Object variable, + final Object key, + final Object value) { + final String loc = String.format("%s:%d", vc.getChr(), vc.getStart()); + callReport.println(Utils.join("\t", Arrays.asList(loc, variable, key, value))); + } + + /** + * Read in a list of ExactCall objects from reader, keeping only those + * with starts in startsToKeep or all sites (if this is empty) + * + * @param reader a just-opened reader sitting at the start of the file + * @param startsToKeep a list of start position of the calls to keep, or empty if all calls should be kept + * @param parser a genome loc parser to create genome locs + * @return a list of ExactCall objects in reader + * @throws IOException + */ + public static List readExactLog(final BufferedReader reader, final List startsToKeep, GenomeLocParser parser) throws IOException { + if ( reader == null ) throw new IllegalArgumentException("reader cannot be null"); + if ( startsToKeep == null ) throw new IllegalArgumentException("startsToKeep cannot be null"); + if ( parser == null ) throw new IllegalArgumentException("GenomeLocParser cannot be null"); + + List calls = new LinkedList(); + + // skip the header line + reader.readLine(); + + // skip the first "type" line + reader.readLine(); + + while (true) { + final VariantContextBuilder builder = new VariantContextBuilder(); + final List alleles = new ArrayList(); + final List genotypes = new ArrayList(); + final double[] posteriors = new double[2]; + final double[] priors = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true); + final List mle = new ArrayList(); + final Map log10pNonRefByAllele = new HashMap(); + long runtimeNano = -1; + + 
GenomeLoc currentLoc = null; + while (true) { + final String line = reader.readLine(); + if (line == null) + return calls; + + final String[] parts = line.split("\t"); + final GenomeLoc lineLoc = parser.parseGenomeLoc(parts[0]); + final String variable = parts[1]; + final String key = parts[2]; + final String value = parts[3]; + + if (currentLoc == null) + currentLoc = lineLoc; + + if (variable.equals("type")) { + if (startsToKeep.isEmpty() || startsToKeep.contains(currentLoc.getStart())) { + builder.alleles(alleles); + final int stop = currentLoc.getStart() + alleles.get(0).length() - 1; + builder.chr(currentLoc.getContig()).start(currentLoc.getStart()).stop(stop); + builder.genotypes(genotypes); + final int[] mleInts = ArrayUtils.toPrimitive(mle.toArray(new Integer[]{})); + final AFCalcResult result = new AFCalcResult(mleInts, 1, alleles, posteriors, priors, log10pNonRefByAllele); + calls.add(new ExactCall(builder.make(), runtimeNano, result)); + } + break; + } else if (variable.equals("allele")) { + final boolean isRef = key.equals("0"); + alleles.add(Allele.create(value, isRef)); + } else if (variable.equals("PL")) { + final GenotypeBuilder gb = new GenotypeBuilder(key); + gb.PL(GenotypeLikelihoods.fromPLField(value).getAsPLs()); + genotypes.add(gb.make()); + } else if (variable.equals("log10PosteriorOfAFEq0")) { + posteriors[0] = Double.valueOf(value); + } else if (variable.equals("log10PosteriorOfAFGt0")) { + posteriors[1] = Double.valueOf(value); + } else if (variable.equals("MLE")) { + mle.add(Integer.valueOf(value)); + } else if (variable.equals("pNonRefByAllele")) { + final Allele a = Allele.create(key); + log10pNonRefByAllele.put(a, Double.valueOf(value)); + } else if (variable.equals("runtime.nano")) { + runtimeNano = Long.valueOf(value); + } else { + // nothing to do + } + } + } + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java new file mode 100755 index 000000000..d0b801a20 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/IndependentAllelesDiploidExactAFCalc.java @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2010. + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.*; + +import java.util.*; + +/** + * Computes the conditional bi-allelic exact results + * + * Suppose vc contains 2 alt allele: A* with C and T. 
This function first computes: + * + * (1) P(D | AF_c > 0 && AF_t == *) [i.e., T can be anything] + * + * it then computes the conditional probability on AF_c == 0: + * + * (2) P(D | AF_t > 0 && AF_c == 0) + * + * Thinking about this visually, we have the following likelihood matrix where each cell is + * the P(D | AF_c == i && AF_t == j): + * + * 0 AF_c > 0 + * ----------------- + * 0 | | + * |--|------------- + * a | | + * f | | + * _ | | + * t | | + * > | | + * 0 | | + * + * What we really want to know how + * + * (3) P(D | AF_c == 0 & AF_t == 0) + * + * compares with + * + * (4) P(D | AF_c > 0 || AF_t > 0) + * + * This is effectively asking for the value in the upper left vs. the sum of all cells. + * + * This class implements the conditional likelihoods summation for any number of alt + * alleles, where each alt allele has its EXACT probability of segregating calculated by + * reducing each alt B into the case XB and computing P(D | AF_b > 0 ) as follows: + * + * Suppose we have for a A/B/C site the following GLs: + * + * AA AB BB AC BC CC + * + * and we want to get the bi-allelic GLs for X/B, where X is everything not B + * + * XX = AA + AC + CC (since X = A or C) + * XB = AB + BC + * BB = BB + * + * After each allele has its probability calculated we compute the joint posterior + * as P(D | AF_* == 0) = prod_i P (D | AF_i == 0), after applying the theta^i + * prior for the ith least likely allele. + */ + public class IndependentAllelesDiploidExactAFCalc extends DiploidExactAFCalc { + /** + * The min. confidence of an allele to be included in the joint posterior. 
+ */ + private final static double MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR = Math.log10(1e-10); + + private final static int[] BIALLELIC_NON_INFORMATIVE_PLS = new int[]{0,0,0}; + private final static List BIALLELIC_NOCALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + + /** + * Sorts AFCalcResults by their posteriors of AF > 0, so the + */ + private final static class CompareAFCalcResultsByPNonRef implements Comparator { + @Override + public int compare(AFCalcResult o1, AFCalcResult o2) { + return -1 * Double.compare(o1.getLog10PosteriorOfAFGT0(), o2.getLog10PosteriorOfAFGT0()); + } + } + + private final static CompareAFCalcResultsByPNonRef compareAFCalcResultsByPNonRef = new CompareAFCalcResultsByPNonRef(); + + /** + * The AFCalc model we are using to do the bi-allelic computation + */ + final AFCalc biAlleleExactModel; + + protected IndependentAllelesDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + biAlleleExactModel = new ReferenceDiploidExactAFCalc(nSamples, 1, ploidy); + } + + /** + * Trivial subclass that helps with debugging by keeping track of the supporting information for this joint call + */ + private static class MyAFCalcResult extends AFCalcResult { + /** + * List of the supporting bi-allelic AFCalcResults that went into making this multi-allelic joint call + */ + final List supporting; + + private MyAFCalcResult(int[] alleleCountsOfMLE, int nEvaluations, List allelesUsedInGenotyping, double[] log10LikelihoodsOfAC, double[] log10PriorsOfAC, Map log10pNonRefByAllele, List supporting) { + super(alleleCountsOfMLE, nEvaluations, allelesUsedInGenotyping, log10LikelihoodsOfAC, log10PriorsOfAC, log10pNonRefByAllele); + this.supporting = supporting; + } + } + + @Override + public AFCalcResult computeLog10PNonRef(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final List independentResultTrackers = computeAlleleIndependentExact(vc, 
log10AlleleFrequencyPriors); + final List withMultiAllelicPriors = applyMultiAllelicPriors(independentResultTrackers); + return combineIndependentPNonRefs(vc, withMultiAllelicPriors); + } + + + /** + * Compute the conditional exact AFCalcResult for each allele in vc independently, returning + * the result of each, in order of the alt alleles in VC + * + * @param vc the VariantContext we want to analyze + * @param log10AlleleFrequencyPriors the priors + * @return a list of the AFCalcResults for each bi-allelic sub context of vc + */ + @Requires({"vc != null", "log10AlleleFrequencyPriors != null"}) + @Ensures("goodIndependentResult(vc, result)") + protected final List computeAlleleIndependentExact(final VariantContext vc, + final double[] log10AlleleFrequencyPriors) { + final List results = new LinkedList(); + + for ( final VariantContext subvc : makeAlleleConditionalContexts(vc) ) { + final AFCalcResult resultTracker = biAlleleExactModel.getLog10PNonRef(subvc, log10AlleleFrequencyPriors); + results.add(resultTracker); + } + + return results; + } + + /** + * Helper function to ensure that the computeAlleleIndependentExact is returning reasonable results + */ + private static boolean goodIndependentResult(final VariantContext vc, final List results) { + if ( results.size() != vc.getNAlleles() - 1) return false; + for ( int i = 0; i < results.size(); i++ ) { + if ( results.get(i).getAllelesUsedInGenotyping().size() != 2 ) + return false; + if ( ! results.get(i).getAllelesUsedInGenotyping().contains(vc.getAlternateAllele(i)) ) + return false; + } + + return true; + } + + /** + * Returns the bi-allelic variant context for each alt allele in vc with bi-allelic likelihoods, in order + * + * @param vc the variant context to split. 
Must have n.alt.alleles > 1 + * @return a bi-allelic variant context for each alt allele in vc + */ + @Requires({"vc != null", "vc.getNAlleles() > 1"}) + @Ensures("result.size() == vc.getNAlleles() - 1") + protected final List makeAlleleConditionalContexts(final VariantContext vc) { + final int nAltAlleles = vc.getNAlleles() - 1; + final List vcs = new LinkedList(); + + for ( int altI = 0; altI < nAltAlleles; altI++ ) { + vcs.add(biallelicCombinedGLs(vc, altI + 1)); + } + + return vcs; + } + + /** + * Create a single bi-allelic variant context from rootVC with alt allele with index altAlleleIndex + * + * @param rootVC the root (potentially multi-allelic) variant context + * @param altAlleleIndex index of the alt allele, from 0 == first alt allele + * @return a bi-allelic variant context based on rootVC + */ + @Requires({"rootVC.getNAlleles() > 1", "altAlleleIndex < rootVC.getNAlleles()"}) + @Ensures({"result.isBiallelic()"}) + protected final VariantContext biallelicCombinedGLs(final VariantContext rootVC, final int altAlleleIndex) { + if ( rootVC.isBiallelic() ) { + return rootVC; + } else { + final int nAlts = rootVC.getNAlleles() - 1; + final List biallelicGenotypes = new ArrayList(rootVC.getNSamples()); + for ( final Genotype g : rootVC.getGenotypes() ) + biallelicGenotypes.add(combineGLs(g, altAlleleIndex, nAlts)); + + final VariantContextBuilder vcb = new VariantContextBuilder(rootVC); + final Allele altAllele = rootVC.getAlternateAllele(altAlleleIndex - 1); + vcb.alleles(Arrays.asList(rootVC.getReference(), altAllele)); + vcb.genotypes(biallelicGenotypes); + return vcb.make(); + } + } + + /** + * Returns a new Genotype with the PLs of the multi-allelic original reduced to a bi-allelic case + * + * This is handled in the following way: + * + * Suppose we have for a A/B/C site the following GLs: + * + * AA AB BB AC BC CC + * + * and we want to get the bi-allelic GLs for X/B, where X is everything not B + * + * XX = AA + AC + CC (since X = A or C) + * XB = AB + 
BC + * BB = BB + * + * @param original the original multi-allelic genotype + * @param altIndex the index of the alt allele we wish to keep in the bialleic case -- with ref == 0 + * @param nAlts the total number of alt alleles + * @return a new biallelic genotype with appropriate PLs + */ + @Requires({"original.hasLikelihoods()"}) // TODO -- add ploidy == 2 test "original.getPLs() == null || original.getPLs().length == 3"}) + @Ensures({"result.hasLikelihoods()", "result.getPL().length == 3"}) + protected Genotype combineGLs(final Genotype original, final int altIndex, final int nAlts ) { + if ( original.isNonInformative() ) + return new GenotypeBuilder(original).PL(BIALLELIC_NON_INFORMATIVE_PLS).alleles(BIALLELIC_NOCALL).make(); + + if ( altIndex < 1 || altIndex > nAlts ) throw new IllegalStateException("altIndex must be between 1 and nAlts " + nAlts); + + final double[] normalizedPr = MathUtils.normalizeFromLog10(GenotypeLikelihoods.fromPLs(original.getPL()).getAsVector()); + final double[] biAllelicPr = new double[3]; + + for ( int index = 0; index < normalizedPr.length; index++ ) { + final GenotypeLikelihoods.GenotypeLikelihoodsAllelePair pair = GenotypeLikelihoods.getAllelePair(index); + + if ( pair.alleleIndex1 == altIndex ) { + if ( pair.alleleIndex2 == altIndex ) + // hom-alt case + biAllelicPr[2] = normalizedPr[index]; + else + // het-alt case + biAllelicPr[1] += normalizedPr[index]; + } else { + if ( pair.alleleIndex2 == altIndex ) + // het-alt case + biAllelicPr[1] += normalizedPr[index]; + else + // hom-non-alt + biAllelicPr[0] += normalizedPr[index]; + } + } + + final double[] GLs = new double[3]; + for ( int i = 0; i < GLs.length; i++ ) GLs[i] = Math.log10(biAllelicPr[i]); + + return new GenotypeBuilder(original).PL(GLs).alleles(BIALLELIC_NOCALL).make(); + } + + protected final List applyMultiAllelicPriors(final List conditionalPNonRefResults) { + final ArrayList sorted = new ArrayList(conditionalPNonRefResults); + + // sort the results, so the most 
likely allele is first + Collections.sort(sorted, compareAFCalcResultsByPNonRef); + + double lastPosteriorGt0 = sorted.get(0).getLog10PosteriorOfAFGT0(); + final double log10SingleAllelePriorOfAFGt0 = conditionalPNonRefResults.get(0).getLog10PriorOfAFGT0(); + + for ( int i = 0; i < sorted.size(); i++ ) { + if ( sorted.get(i).getLog10PosteriorOfAFGT0() > lastPosteriorGt0 ) + throw new IllegalStateException("pNonRefResults not sorted: lastPosteriorGt0 " + lastPosteriorGt0 + " but current is " + sorted.get(i).getLog10PosteriorOfAFGT0()); + + final double log10PriorAFGt0 = (i + 1) * log10SingleAllelePriorOfAFGt0; + final double log10PriorAFEq0 = Math.log10(1 - Math.pow(10, log10PriorAFGt0)); + final double[] thetaTONPriors = new double[] { log10PriorAFEq0, log10PriorAFGt0 }; + + // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior + sorted.set(i, sorted.get(i).withNewPriors(MathUtils.normalizeFromLog10(thetaTONPriors, true))); + } + + return sorted; + } + + + /** + * Take the independent estimates of pNonRef for each alt allele and combine them into a single result + * + * Given n independent calculations for each of n alternate alleles create a single + * combined AFCalcResult with: + * + * priors for AF == 0 equal to theta^N for the nth least likely allele + * posteriors that reflect the combined chance that any alleles are segregating and corresponding + * likelihoods + * combined MLEs in the order of the alt alleles in vc + * + * @param sortedResultsWithThetaNPriors the pNonRef result for each allele independently + */ + protected AFCalcResult combineIndependentPNonRefs(final VariantContext vc, + final List sortedResultsWithThetaNPriors) { + int nEvaluations = 0; + final int nAltAlleles = sortedResultsWithThetaNPriors.size(); + final int[] alleleCountsOfMLE = new int[nAltAlleles]; + final double[] log10PriorsOfAC = new double[2]; + final Map log10pNonRefByAllele = new HashMap(nAltAlleles); + + // the sum of the log10 
posteriors for AF == 0 and AF > 0 to determine joint probs + double log10PosteriorOfACEq0Sum = 0.0; + double log10PosteriorOfACGt0Sum = 0.0; + + boolean anyPoly = false; + for ( final AFCalcResult sortedResultWithThetaNPriors : sortedResultsWithThetaNPriors ) { + final Allele altAllele = sortedResultWithThetaNPriors.getAllelesUsedInGenotyping().get(1); + final int altI = vc.getAlleles().indexOf(altAllele) - 1; + + // MLE of altI allele is simply the MLE of this allele in altAlleles + alleleCountsOfMLE[altI] = sortedResultWithThetaNPriors.getAlleleCountAtMLE(altAllele); + + // the AF > 0 case requires us to store the normalized likelihood for later summation + if ( sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0() > MIN_LOG10_CONFIDENCE_TO_INCLUDE_ALLELE_IN_POSTERIOR ) { + anyPoly = true; + log10PosteriorOfACEq0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFEq0(); + log10PriorsOfAC[0] += sortedResultWithThetaNPriors.getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] += sortedResultWithThetaNPriors.getLog10PriorOfAFGT0(); + } + + log10PosteriorOfACGt0Sum += sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0(); + + // bind pNonRef for allele to the posterior value of the AF > 0 with the new adjusted prior + log10pNonRefByAllele.put(altAllele, sortedResultWithThetaNPriors.getLog10PosteriorOfAFGT0()); + + // trivial -- update the number of evaluations + nEvaluations += sortedResultWithThetaNPriors.nEvaluations; + } + + // If no alleles were polymorphic, make sure we have the proper priors (the defaults) for likelihood calculation + if ( ! anyPoly ) { + log10PriorsOfAC[0] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFEq0(); + log10PriorsOfAC[1] = sortedResultsWithThetaNPriors.get(0).getLog10PriorOfAFGT0(); + } + + // In principle, if B_p = x and C_p = y are the probabilities of being poly for alleles B and C, + // the probability of being poly is (1 - B_p) * (1 - C_p) = (1 - x) * (1 - y). 
We want to estimate confidently + // log10((1 - x) * (1 - y)) which is log10(1 - x) + log10(1 - y). This sum is log10PosteriorOfACEq0 + // + // note we need to handle the case where the posterior of AF == 0 is 0.0, in which case we + // use the summed log10PosteriorOfACGt0Sum directly. This happens in cases where + // AF > 0 : 0.0 and AF == 0 : -16, and if you use the inverse calculation you get 0.0 and MathUtils.LOG10_P_OF_ZERO + final double log10PosteriorOfACGt0; + if ( log10PosteriorOfACEq0Sum == 0.0 ) + log10PosteriorOfACGt0 = log10PosteriorOfACGt0Sum; + else + log10PosteriorOfACGt0 = Math.max(Math.log10(1 - Math.pow(10, log10PosteriorOfACEq0Sum)), MathUtils.LOG10_P_OF_ZERO); + + final double[] log10LikelihoodsOfAC = new double[] { + // L + prior = posterior => L = poster - prior + log10PosteriorOfACEq0Sum - log10PriorsOfAC[0], + log10PosteriorOfACGt0 - log10PriorsOfAC[1] + }; + + return new MyAFCalcResult(alleleCountsOfMLE, nEvaluations, vc.getAlleles(), + // necessary to ensure all values < 0 + MathUtils.normalizeFromLog10(log10LikelihoodsOfAC, true), + // priors incorporate multiple alt alleles, must be normalized + MathUtils.normalizeFromLog10(log10PriorsOfAC, true), + log10pNonRefByAllele, sortedResultsWithThetaNPriors); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java new file mode 100644 index 000000000..fc26111e0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/OriginalDiploidExactAFCalc.java @@ -0,0 +1,153 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.broadinstitute.sting.utils.variantcontext.VariantContext; + +import 
java.util.ArrayList; +import java.util.Collections; +import java.util.Map; + +/** + * Original bi-allelic ~O(N) implementation. Kept here for posterity and reference + */ +class OriginalDiploidExactAFCalc extends DiploidExactAFCalc { + protected OriginalDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + } + + @Override + protected AFCalcResult computeLog10PNonRef(VariantContext vc, double[] log10AlleleFrequencyPriors) { + final double[] log10AlleleFrequencyLikelihoods = new double[log10AlleleFrequencyPriors.length]; + final double[] log10AlleleFrequencyPosteriors = new double[log10AlleleFrequencyPriors.length]; + final Pair result = linearExact(vc, log10AlleleFrequencyPriors, log10AlleleFrequencyLikelihoods, log10AlleleFrequencyPosteriors); + final int lastK = result.getFirst(); + final int mleK = result.getSecond(); + + final double log10LikelihoodAFGt0 = lastK == 0 ? MathUtils.LOG10_P_OF_ZERO : MathUtils.log10sumLog10(log10AlleleFrequencyLikelihoods, 1, lastK+1); + final double[] log10Likelihoods = new double[]{log10AlleleFrequencyLikelihoods[0], log10LikelihoodAFGt0}; + final double[] log10Priors = new double[]{log10AlleleFrequencyPriors[0], MathUtils.log10sumLog10(log10AlleleFrequencyPriors, 1)}; + final double[] log10Posteriors = MathUtils.vectorSum(log10Likelihoods, log10Priors); + + final double log10PNonRef = log10Posteriors[1] > log10Posteriors[0] ? 
0.0 : MathUtils.LOG10_P_OF_ZERO; + final Map log10pNonRefByAllele = Collections.singletonMap(vc.getAlternateAllele(0), log10PNonRef); + + return new AFCalcResult(new int[]{mleK}, 0, vc.getAlleles(), + MathUtils.normalizeFromLog10(log10Likelihoods, true), + MathUtils.normalizeFromLog10(log10Priors, true), + log10pNonRefByAllele); + } + + /** + * A simple data structure that holds the current, prev, and prev->prev likelihoods vectors + * for the exact model calculation + */ + private final static class ExactACCache { + double[] kMinus2, kMinus1, kMinus0; + + private static double[] create(int n) { + return new double[n]; + } + + public ExactACCache(int n) { + kMinus2 = create(n); + kMinus1 = create(n); + kMinus0 = create(n); + } + + final public void rotate() { + double[] tmp = kMinus2; + kMinus2 = kMinus1; + kMinus1 = kMinus0; + kMinus0 = tmp; + } + + final public double[] getkMinus2() { + return kMinus2; + } + + final public double[] getkMinus1() { + return kMinus1; + } + + final public double[] getkMinus0() { + return kMinus0; + } + } + + public Pair linearExact(final VariantContext vc, + double[] log10AlleleFrequencyPriors, + double[] log10AlleleFrequencyLikelihoods, + double[] log10AlleleFrequencyPosteriors) { + final ArrayList genotypeLikelihoods = getGLs(vc.getGenotypes(), true); + final int numSamples = genotypeLikelihoods.size()-1; + final int numChr = 2*numSamples; + + final ExactACCache logY = new ExactACCache(numSamples+1); + logY.getkMinus0()[0] = 0.0; // the zero case + + double maxLog10L = Double.NEGATIVE_INFINITY; + boolean done = false; + int lastK = -1, mleK = -1; + + for (int k=0; k <= numChr && ! 
done; k++ ) { + final double[] kMinus0 = logY.getkMinus0(); + + if ( k == 0 ) { // special case for k = 0 + for ( int j=1; j <= numSamples; j++ ) { + kMinus0[j] = kMinus0[j-1] + genotypeLikelihoods.get(j)[0]; + } + } else { // k > 0 + final double[] kMinus1 = logY.getkMinus1(); + final double[] kMinus2 = logY.getkMinus2(); + + for ( int j=1; j <= numSamples; j++ ) { + final double[] gl = genotypeLikelihoods.get(j); + final double logDenominator = MathUtils.log10Cache[2*j] + MathUtils.log10Cache[2*j-1]; + + double aa = Double.NEGATIVE_INFINITY; + double ab = Double.NEGATIVE_INFINITY; + if (k < 2*j-1) + aa = MathUtils.log10Cache[2*j-k] + MathUtils.log10Cache[2*j-k-1] + kMinus0[j-1] + gl[0]; + + if (k < 2*j) + ab = MathUtils.log10Cache[2*k] + MathUtils.log10Cache[2*j-k]+ kMinus1[j-1] + gl[1]; + + double log10Max; + if (k > 1) { + final double bb = MathUtils.log10Cache[k] + MathUtils.log10Cache[k-1] + kMinus2[j-1] + gl[2]; + log10Max = MathUtils.approximateLog10SumLog10(aa, ab, bb); + } else { + // we know we aren't considering the BB case, so we can use an optimized log10 function + log10Max = MathUtils.approximateLog10SumLog10(aa, ab); + } + + // finally, update the L(j,k) value + kMinus0[j] = log10Max - logDenominator; + } + } + + // update the posteriors vector + final double log10LofK = kMinus0[numSamples]; + log10AlleleFrequencyLikelihoods[k] = log10LofK; + log10AlleleFrequencyPosteriors[k] = log10LofK + log10AlleleFrequencyPriors[k]; + + // can we abort early? 
+ lastK = k; + if ( log10LofK > maxLog10L ) { + maxLog10L = log10LofK; + mleK = k; + } + + if ( log10LofK < maxLog10L - StateTracker.MAX_LOG10_ERROR_TO_STOP_EARLY ) { + //if ( DEBUG ) System.out.printf(" *** breaking early k=%d log10L=%.2f maxLog10L=%.2f%n", k, log10LofK, maxLog10L); + done = true; + } + + logY.rotate(); + } + + return new Pair(lastK, mleK); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java new file mode 100644 index 000000000..97e5fed3b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/ReferenceDiploidExactAFCalc.java @@ -0,0 +1,7 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +public class ReferenceDiploidExactAFCalc extends DiploidExactAFCalc { + protected ReferenceDiploidExactAFCalc(int nSamples, int maxAltAlleles, final int ploidy) { + super(nSamples, maxAltAlleles, ploidy); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java new file mode 100644 index 000000000..b82ec1d29 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/StateTracker.java @@ -0,0 +1,282 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Keeps track of the state information during the exact model AF calculation. 
+ *
+ * Tracks things like the MLE and MAP AC values, their corresponding likelihood and posterior
+ * values, the likelihood of the AF == 0 state, and the number of evaluations needed
+ * by the calculation to compute the P(AF == 0)
+ */
+final class StateTracker {
+    protected static final double VALUE_NOT_CALCULATED = Double.NEGATIVE_INFINITY;
+    protected final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
+
+    /**
+     * These variables are intended to contain the MLE and MAP (and their corresponding allele counts)
+     * of the site over all alternate alleles
+     */
+    protected double log10MLE;
+    protected double log10MAP;
+
+    /**
+     * Vectors with maxAltAlleles values containing the AC values at the MLE and at the MAP
+     *
+     * The values of the ACs for this call are stored in the getAllelesUsedInGenotyping order,
+     * starting from index 0 (i.e., the first alt allele is at 0). The vector is always
+     * maxAltAlleles in length, and so only the first getAllelesUsedInGenotyping.size() - 1 values
+     * are meaningful.
+     */
+    private final int[] alleleCountsOfMLE;
+    private final int[] alleleCountsOfMAP;
+
+    /**
+     * A vector of log10 likelihood values seen, for future summation. When the size of the
+     * vector is exceeded -- because we've pushed more values than there's space to hold
+     * -- we simply sum up the existing values, make that the first value, and continue.
+     */
+    private final double[] log10LikelihoodsForAFGt0 = new double[LIKELIHOODS_CACHE_SIZE];
+    private static final int LIKELIHOODS_CACHE_SIZE = 5000;
+    private int log10LikelihoodsForAFGt0CacheIndex = 0;
+
+    /**
+     * The actual sum of the likelihoods. Null if the sum hasn't been computed yet
+     */
+    protected Double log10LikelihoodsForAFGt0Sum = null;
+
+    /**
+     * Contains the likelihood for the site's being monomorphic (i.e. AF=0 for all alternate alleles)
+     */
+    private double log10LikelihoodOfAFzero = 0.0;
+
+    /**
+     * The number of evaluations we've gone through in the AFCalc
+     */
+    private int nEvaluations = 0;
+
+    /**
+     * The list of alleles actually used in computing the AF (reference allele first); null until set
+     */
+    private List<Allele> allelesUsedInGenotyping = null;
+
+    /**
+     * Create a results object capable of storing results for calls with up to maxAltAlleles
+     *
+     * @param maxAltAlleles an integer >= 1; smaller values raise an IllegalArgumentException
+     */
+    public StateTracker(final int maxAltAlleles) {
+        if ( maxAltAlleles < 1 ) throw new IllegalArgumentException("maxAltAlleles must be >= 1, saw " + maxAltAlleles);
+
+        alleleCountsOfMLE = new int[maxAltAlleles];
+        alleleCountsOfMAP = new int[maxAltAlleles];
+
+        reset();
+    }
+
+    /**
+     * Is the likelihood of configuration K too low to consider, relative to the
+     * maximum likelihood seen already?
+     *
+     * @param log10LofK the log10 likelihood of the configuration we're considering analyzing
+     * @return true if log10LofK is more than MAX_LOG10_ERROR_TO_STOP_EARLY log10 units below log10MLE
+     */
+    private boolean tooLowLikelihood(final double log10LofK) {
+        return log10LofK < log10MLE - MAX_LOG10_ERROR_TO_STOP_EARLY;
+    }
+
+    /**
+     * @return true iff every MLE allele count in this object is less than or equal to the corresponding count in otherACs
+     */
+    private boolean isLowerAC(final ExactACcounts otherACs) {
+        final int[] otherACcounts = otherACs.getCounts();
+
+        for ( int i = 0; i < otherACcounts.length; i++ ) {
+            if ( alleleCountsOfMLE[i] > otherACcounts[i] )
+                return false;
+        }
+
+        return true;
+    }
+
+    /**
+     * Should we stop exploring paths from ACs, given its log10LofK
+     * @param log10LofK the log10LofK of these ACs
+     * @param ACs the ACs of this state
+     * @param enforceLowerACs if true, only abort when isLowerAC(ACs) also holds
+     * @return true if there's no reason to continue with subpaths of AC, or false otherwise
+     */
+    protected boolean abort( final double log10LofK, final ExactACcounts ACs, final boolean enforceLowerACs ) {
+        return tooLowLikelihood(log10LofK) && (!enforceLowerACs || isLowerAC(ACs));
+    }
+    /** @return the internal vector of allele counts recorded for the MAP state (the array itself, not a copy) */
+    @Ensures("result != null")
+    protected int[] getAlleleCountsOfMAP() {
+        return alleleCountsOfMAP;
+    }
+    /** @return the number of evaluations counted via incNEvaluations() since the last reset() */
+    @Ensures("result >= 0")
+    protected int getnEvaluations() {
+        return nEvaluations;
+    }
+
+    /**
+     * @return the likelihoods summed across all AC values for AC > 0; computed on first use and then memoized
+     */
+    private double getLog10LikelihoodOfAFNotZero() {
+        if ( log10LikelihoodsForAFGt0Sum == null ) {
+            if ( log10LikelihoodsForAFGt0CacheIndex == 0 ) // there's nothing to sum up, so make the sum equal to the smallest thing we have
+                log10LikelihoodsForAFGt0Sum = MathUtils.LOG10_P_OF_ZERO;
+            else
+                log10LikelihoodsForAFGt0Sum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex);
+        }
+        return log10LikelihoodsForAFGt0Sum;
+    }
+
+    /**
+     * @return the log10 likelihood of AF == 0
+     */
+    private double getLog10LikelihoodOfAFzero() {
+        return log10LikelihoodOfAFzero;
+    }
+
+    /**
+     * Convert this state to a corresponding AFCalcResult.
+     *
+     * Assumes that the values in this state have been filled in with meaningful values during the calculation.
+     * For example, that the allelesUsedInGenotyping has been set, that the alleleCountsOfMLE contains meaningful
+     * values, etc.
+     *
+     * @param log10PriorsByAC the priors by AC; element 0 is used as the AF == 0 prior and elements 1.. are summed for AF > 0
+     *
+     * @return an AFCalcResult summarizing the final results of this calculation
+     */
+    @Requires("allelesUsedInGenotyping != null")
+    protected AFCalcResult toAFCalcResult(final double[] log10PriorsByAC) {
+        final int [] subACOfMLE = Arrays.copyOf(alleleCountsOfMLE, allelesUsedInGenotyping.size() - 1);
+        final double[] log10Likelihoods = MathUtils.normalizeFromLog10(new double[]{getLog10LikelihoodOfAFzero(), getLog10LikelihoodOfAFNotZero()}, true);
+        final double[] log10Priors = MathUtils.normalizeFromLog10(new double[]{log10PriorsByAC[0], MathUtils.log10sumLog10(log10PriorsByAC, 1)}, true);
+
+        final Map<Allele, Double> log10pNonRefByAllele = new HashMap<Allele, Double>(allelesUsedInGenotyping.size());
+        for ( int i = 0; i < subACOfMLE.length; i++ ) {
+            final Allele allele = allelesUsedInGenotyping.get(i+1); // i+1 skips the reference allele at index 0
+            final double log10PNonRef = alleleCountsOfMAP[i] > 0 ? 0 : -10000; // TODO -- a total hack but in effect what the old behavior was
+            log10pNonRefByAllele.put(allele, log10PNonRef);
+        }
+
+        return new AFCalcResult(subACOfMLE, nEvaluations, allelesUsedInGenotyping, log10Likelihoods, log10Priors, log10pNonRefByAllele);
+    }
+
+    // --------------------------------------------------------------------------------
+    //
+    // Protected mutational methods only for use within the calculation models themselves
+    //
+    // --------------------------------------------------------------------------------
+
+    /**
+     * Reset the data in this results object, so that it can be used in a subsequent AF calculation
+     *
+     * Resetting of the data is done by the calculation model itself, so shouldn't be done by callers any longer
+     */
+    protected void reset() {
+        log10MLE = log10MAP = log10LikelihoodOfAFzero = VALUE_NOT_CALCULATED;
+        log10LikelihoodsForAFGt0CacheIndex = 0;
+        log10LikelihoodsForAFGt0Sum = null;
+        allelesUsedInGenotyping = null;
+        nEvaluations = 0;
+        Arrays.fill(alleleCountsOfMLE, 0);
+        Arrays.fill(alleleCountsOfMAP, 0);
+        Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY); // cells at or beyond the cache index are never summed
+    }
+
+    /**
+     * Tell this result we used one more evaluation cycle
+     */
+    protected void incNEvaluations() {
+        nEvaluations++;
+    }
+
+    /**
+     * Update the maximum log10 likelihood seen, if log10LofK is higher, and the corresponding ACs of this state
+     *
+     * @param log10LofK the likelihood of our current configuration state, cannot be the 0 state
+     * @param alleleCountsForK the allele counts for this state
+     */
+    @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"})
+    @Ensures("log10MLE == Math.max(log10LofK, log10MLE)")
+    protected void updateMLEifNeeded(final double log10LofK, final int[] alleleCountsForK) {
+        addToLikelihoodsCache(log10LofK);
+
+        if ( log10LofK > log10MLE ) {
+            log10MLE = log10LofK;
+            System.arraycopy(alleleCountsForK, 0, alleleCountsOfMLE, 0, alleleCountsForK.length);
+        }
+    }
+
+    /**
+     * Update the maximum log10 posterior seen, if log10PofK is higher, and the corresponding ACs of this state
+     *
+     * @param log10PofK the posterior of our current configuration state
+     * @param alleleCountsForK the allele counts for this state
+     */
+    @Requires({"alleleCountsForK != null", "MathUtils.sum(alleleCountsForK) >= 0"})
+    @Ensures("log10MAP == Math.max(log10PofK, log10MAP)")
+    protected void updateMAPifNeeded(final double log10PofK, final int[] alleleCountsForK) {
+        if ( log10PofK > log10MAP ) {
+            log10MAP = log10PofK;
+            System.arraycopy(alleleCountsForK, 0, alleleCountsOfMAP, 0, alleleCountsForK.length);
+        }
+    }
+    /** Append log10LofK to the AF > 0 likelihood cache, condensing the cache to a single summed cell when it fills up */
+    private void addToLikelihoodsCache(final double log10LofK) {
+        // add to the cache (NOTE: a previously memoized log10LikelihoodsForAFGt0Sum is not invalidated here)
+        log10LikelihoodsForAFGt0[log10LikelihoodsForAFGt0CacheIndex++] = log10LofK;
+
+        // if we've filled up the cache, then condense by summing up all of the values and placing the sum back into the first cell
+        if ( log10LikelihoodsForAFGt0CacheIndex == LIKELIHOODS_CACHE_SIZE) {
+            final double temporarySum = MathUtils.log10sumLog10(log10LikelihoodsForAFGt0, 0, log10LikelihoodsForAFGt0CacheIndex);
+            Arrays.fill(log10LikelihoodsForAFGt0, Double.POSITIVE_INFINITY);
+            log10LikelihoodsForAFGt0[0] = temporarySum;
+            log10LikelihoodsForAFGt0CacheIndex = 1;
+        }
+    }
+    /** Record the AF == 0 likelihood; if it exceeds the current MLE, the all-zero AC vector becomes the MLE */
+    protected void setLog10LikelihoodOfAFzero(final double log10LikelihoodOfAFzero) {
+        this.log10LikelihoodOfAFzero = log10LikelihoodOfAFzero;
+        if ( log10LikelihoodOfAFzero > log10MLE ) {
+            log10MLE = log10LikelihoodOfAFzero;
+            Arrays.fill(alleleCountsOfMLE, 0);
+        }
+    }
+    /** If the AF == 0 posterior exceeds the current MAP, the all-zero AC vector becomes the MAP */
+    @Requires({"MathUtils.goodLog10Probability(log10PosteriorOfAFzero)"})
+    protected void setLog10PosteriorOfAFzero(final double log10PosteriorOfAFzero) {
+        if ( log10PosteriorOfAFzero > log10MAP ) {
+            log10MAP = log10PosteriorOfAFzero;
+            Arrays.fill(alleleCountsOfMAP, 0);
+        }
+    }
+
+    /**
+     * Set the list of alleles used in genotyping
+     *
+     * @param allelesUsedInGenotyping the list of alleles, where the first allele is reference; stored by reference, not copied
+     */
+    @Requires({"allelesUsedInGenotyping != null", "allelesUsedInGenotyping.size() > 1"})
+    protected void setAllelesUsedInGenotyping(List<Allele> allelesUsedInGenotyping) {
+        if ( allelesUsedInGenotyping == null || allelesUsedInGenotyping.isEmpty() )
+            throw new IllegalArgumentException("allelesUsedInGenotyping cannot be null or empty");
+        if ( allelesUsedInGenotyping.get(0).isNonReference() )
+            throw new IllegalArgumentException("The first element of allelesUsedInGenotyping must be the reference allele");
+
+        this.allelesUsedInGenotyping = allelesUsedInGenotyping;
+    }
+}
diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index d61b9e9b6..998894fbf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -36,8 +36,8 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.BAQMode; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.*; @@ -112,7 +112,7 @@ import java.util.*; * @author ebanks */ @DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} ) -@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = BAQ.ApplicationTime.ON_OUTPUT) +@BAQMode(QualityMode = BAQ.QualityMode.ADD_TAG, ApplicationTime = ReadTransformer.ApplicationTime.ON_OUTPUT) public class IndelRealigner extends ReadWalker { public static final String ORIGINAL_CIGAR_TAG = "OC"; @@ -370,8 +370,6 @@ public class IndelRealigner extends ReadWalker { currentInterval = intervals.hasNext() ? 
intervals.next() : null; - writerToUse = writer; - if ( N_WAY_OUT != null ) { boolean createIndex = true; @@ -383,9 +381,9 @@ public class IndelRealigner extends ReadWalker { createIndex, generateMD5s,createProgramRecord(),KEEP_ALL_PG_RECORDS); } } else { - // set up the output writer setupWriter(getToolkit().getSAMFileHeader()); + writerToUse = writer; } manager = new ConstrainedMateFixingManager(writerToUse, getToolkit().getGenomeLocParser(), MAX_ISIZE_FOR_MOVEMENT, MAX_POS_MOVE_ALLOWED, MAX_RECORDS_IN_MEMORY); @@ -473,7 +471,7 @@ public class IndelRealigner extends ReadWalker { readsActuallyCleaned.clear(); } - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { if ( currentInterval == null ) { emit(read); return 0; @@ -540,7 +538,7 @@ public class IndelRealigner extends ReadWalker { // TODO -- it would be nice if we could use indels from 454/Ion reads as alternate consenses } - private void cleanAndCallMap(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker, GenomeLoc readLoc) { + private void cleanAndCallMap(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker, GenomeLoc readLoc) { if ( readsToClean.size() > 0 ) { GenomeLoc earliestPossibleMove = getToolkit().getGenomeLocParser().createGenomeLoc(readsToClean.getReads().get(0)); if ( manager.canMoveReads(earliestPossibleMove) ) @@ -619,17 +617,12 @@ public class IndelRealigner extends ReadWalker { } } - private void populateKnownIndels(ReadMetaDataTracker metaDataTracker, ReferenceContext ref) { - for ( Collection rods : metaDataTracker.getContigOffsetMapping().values() ) { - Iterator rodIter = rods.iterator(); - while ( rodIter.hasNext() ) { - Object rod = rodIter.next().getUnderlyingObject(); - if ( indelRodsSeen.contains(rod) ) - continue; - indelRodsSeen.add(rod); - if ( rod instanceof VariantContext ) - 
knownIndelsToTry.add((VariantContext)rod); - } + private void populateKnownIndels(RefMetaDataTracker metaDataTracker, ReferenceContext ref) { + for ( final VariantContext vc : metaDataTracker.getValues(known) ) { + if ( indelRodsSeen.contains(vc) ) + continue; + indelRodsSeen.add(vc); + knownIndelsToTry.add(vc); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java index b08def44f..21b3b71d8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/LeftAlignIndels.java @@ -27,12 +27,11 @@ package org.broadinstitute.sting.gatk.walkers.indels; import net.sf.samtools.Cigar; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.sam.AlignmentUtils; @@ -80,9 +79,9 @@ public class LeftAlignIndels extends ReadWalker { writer.addAlignment(read); } - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { // we can not deal with screwy records - if ( read.getCigar().numCigarElements() == 0 ) { + if ( read.getReadUnmappedFlag() || read.getCigar().numCigarElements() == 0 ) { emit(read); return 0; } diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 65c5a2fbc..79962a3e4 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -27,26 +27,30 @@ package org.broadinstitute.sting.gatk.walkers.indels; import com.google.java.contract.Ensures; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.PairHMM; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.pairhmm.ExactPairHMM; +import org.broadinstitute.sting.utils.pairhmm.OriginalPairHMM; +import org.broadinstitute.sting.utils.pairhmm.PairHMM; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; +import java.io.PrintStream; import java.util.Arrays; -import java.util.HashMap; import java.util.LinkedHashMap; +import java.util.Map; public class PairHMMIndelErrorModel { public static final int BASE_QUAL_THRESHOLD = 20; private boolean DEBUG = false; - private boolean bandedLikelihoods = false; private static final int MAX_CACHED_QUAL = 127; @@ -65,6 +69,8 @@ public class PairHMMIndelErrorModel { private final byte[] GAP_OPEN_PROB_TABLE; private final byte[] GAP_CONT_PROB_TABLE; + private final PairHMM pairHMM; + ///////////////////////////// // Private Member Variables 
///////////////////////////// @@ -83,15 +89,26 @@ public class PairHMMIndelErrorModel { } } - public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, boolean bandedLikelihoods) { + public PairHMMIndelErrorModel(byte indelGOP, byte indelGCP, boolean deb, final PairHMM.HMM_IMPLEMENTATION hmmType ) { this.DEBUG = deb; - this.bandedLikelihoods = bandedLikelihoods; + + switch (hmmType) { + case EXACT: + pairHMM = new ExactPairHMM(); + break; + case ORIGINAL: + pairHMM = new OriginalPairHMM(); + break; + case CACHING: + case LOGLESS_CACHING: + default: + throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the UnifiedGenotyper. Acceptable options are ORIGINAL and EXACT."); + } // fill gap penalty table, affine naive model: this.GAP_CONT_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; this.GAP_OPEN_PROB_TABLE = new byte[MAX_HRUN_GAP_IDX]; - for (int i = 0; i < START_HRUN_GAP_IDX; i++) { GAP_OPEN_PROB_TABLE[i] = indelGOP; GAP_CONT_PROB_TABLE[i] = indelGCP; @@ -167,11 +184,18 @@ public class PairHMMIndelErrorModel { } - public synchronized double[] computeDiploidReadHaplotypeLikelihoods(ReadBackedPileup pileup, LinkedHashMap haplotypeMap, ReferenceContext ref, int eventLength, HashMap> indelLikelihoodMap){ + public synchronized double[] computeDiploidReadHaplotypeLikelihoods(final ReadBackedPileup pileup, + final LinkedHashMap haplotypeMap, + final ReferenceContext ref, + final int eventLength, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, + final double downsamplingFraction, + final PrintStream downsamplingLog) { final int numHaplotypes = haplotypeMap.size(); final int readCounts[] = new int[pileup.getNumberOfElements()]; - final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, eventLength, indelLikelihoodMap, readCounts); + final double[][] readLikelihoods = computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, ref, 
eventLength, perReadAlleleLikelihoodMap, readCounts); + perReadAlleleLikelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog); return getDiploidHaplotypeLikelihoods(numHaplotypes, readCounts, readLikelihoods); } @@ -181,10 +205,9 @@ public class PairHMMIndelErrorModel { final LinkedHashMap haplotypeMap, final ReferenceContext ref, final int eventLength, - final HashMap> indelLikelihoodMap, + final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap, final int[] readCounts) { final double readLikelihoods[][] = new double[pileup.getNumberOfElements()][haplotypeMap.size()]; - final PairHMM pairHMM = new PairHMM(bandedLikelihoods); int readIdx=0; for (PileupElement p: pileup) { @@ -192,8 +215,8 @@ public class PairHMMIndelErrorModel { readCounts[readIdx] = p.getRepresentativeCount(); // check if we've already computed likelihoods for this pileup element (i.e. for this read at this location) - if (indelLikelihoodMap.containsKey(p)) { - HashMap el = indelLikelihoodMap.get(p); + if (perReadAlleleLikelihoodMap.containsPileupElement(p)) { + Map el = perReadAlleleLikelihoodMap.getLikelihoodsAssociatedWithPileupElement(p); int j=0; for (Allele a: haplotypeMap.keySet()) { readLikelihoods[readIdx][j++] = el.get(a); @@ -201,7 +224,7 @@ public class PairHMMIndelErrorModel { } else { final int refWindowStart = ref.getWindow().getStart(); - final int refWindowStop = ref.getWindow().getStop(); + final int refWindowStop = ref.getWindow().getStop(); if (DEBUG) { System.out.format("Read Name:%s, aln start:%d aln stop:%d orig cigar:%s\n",p.getRead().getReadName(), p.getRead().getAlignmentStart(), p.getRead().getAlignmentEnd(), p.getRead().getCigarString()); @@ -280,7 +303,7 @@ public class PairHMMIndelErrorModel { System.out.format("numStartSoftClippedBases: %d numEndSoftClippedBases: %d WinStart:%d WinStop:%d start: %d stop: %d readLength: %d\n", numStartSoftClippedBases, numEndSoftClippedBases, ref.getWindow().getStart(), ref.getWindow().getStop(), 
startLocationInRefForHaplotypes, stopLocationInRefForHaplotypes, read.getReadLength()); - LinkedHashMap readEl = new LinkedHashMap(); + // LinkedHashMap readEl = new LinkedHashMap(); /** * Check if we'll end up with an empty read once all clipping is done @@ -288,7 +311,7 @@ public class PairHMMIndelErrorModel { if (numStartSoftClippedBases + numEndSoftClippedBases >= unclippedReadBases.length) { int j=0; for (Allele a: haplotypeMap.keySet()) { - readEl.put(a,0.0); + perReadAlleleLikelihoodMap.add(p,a,0.0); readLikelihoods[readIdx][j++] = 0.0; } } @@ -297,8 +320,6 @@ public class PairHMMIndelErrorModel { final byte[] readQuals = Arrays.copyOfRange(unclippedReadQuals,numStartSoftClippedBases, unclippedReadBases.length-numEndSoftClippedBases); int j=0; - // initialize path metric and traceback memories for likelihood computation - double[][] matchMetricArray = null, XMetricArray = null, YMetricArray = null; byte[] previousHaplotypeSeen = null; final byte[] contextLogGapOpenProbabilities = new byte[readBases.length]; final byte[] contextLogGapContinuationProbabilities = new byte[readBases.length]; @@ -329,45 +350,39 @@ public class PairHMMIndelErrorModel { - final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), - (int)indStart, (int)indStop); + final byte[] haplotypeBases = Arrays.copyOfRange(haplotype.getBases(), + (int)indStart, (int)indStop); - final int X_METRIC_LENGTH = readBases.length+2; - final int Y_METRIC_LENGTH = haplotypeBases.length+2; + final int X_METRIC_LENGTH = readBases.length+2; + final int Y_METRIC_LENGTH = haplotypeBases.length+2; - if (matchMetricArray == null) { - //no need to reallocate arrays for each new haplotype, as length won't change - matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + if (previousHaplotypeSeen == null) { + //no need to reallocate arrays for each new haplotype, as 
length won't change + pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); + } + + int startIndexInHaplotype = 0; + if (previousHaplotypeSeen != null) + startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); + previousHaplotypeSeen = haplotypeBases.clone(); + + readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotypeBases, readBases, readQuals, + (read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities), + (read.hasBaseIndelQualities() ? read.getBaseDeletionQualities() : contextLogGapOpenProbabilities), + contextLogGapContinuationProbabilities, startIndexInHaplotype, false); - PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); - } + if (DEBUG) { + System.out.println("H:"+new String(haplotypeBases)); + System.out.println("R:"+new String(readBases)); + System.out.format("L:%4.2f\n",readLikelihood); + System.out.format("StPos:%d\n", startIndexInHaplotype); + } - int startIndexInHaplotype = 0; - if (previousHaplotypeSeen != null) - startIndexInHaplotype = computeFirstDifferingPosition(haplotypeBases, previousHaplotypeSeen); - previousHaplotypeSeen = haplotypeBases.clone(); - - readLikelihood = pairHMM.computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, - (read.hasBaseIndelQualities() ? read.getBaseInsertionQualities() : contextLogGapOpenProbabilities), - (read.hasBaseIndelQualities() ? 
read.getBaseDeletionQualities() : contextLogGapOpenProbabilities), - contextLogGapContinuationProbabilities, - startIndexInHaplotype, matchMetricArray, XMetricArray, YMetricArray); - - - if (DEBUG) { - System.out.println("H:"+new String(haplotypeBases)); - System.out.println("R:"+new String(readBases)); - System.out.format("L:%4.2f\n",readLikelihood); - System.out.format("StPos:%d\n", startIndexInHaplotype); - } - readEl.put(a,readLikelihood); + perReadAlleleLikelihoodMap.add(p, a, readLikelihood); readLikelihoods[readIdx][j++] = readLikelihood; } } - indelLikelihoodMap.put(p,readEl); } readIdx++; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java index fc6df6902..b14dc9cc9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreator.java @@ -33,10 +33,10 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.pileup.PileupElement; @@ -57,7 +57,7 @@ import java.util.TreeSet; * is minimized across all the reads. 
In general, a large percent of regions requiring local realignment are due to the presence of an insertion * or deletion (indels) in the individual's genome with respect to the reference genome. Such alignment artifacts result in many bases mismatching * the reference near the misalignment, which are easily mistaken as SNPs. Moreover, since read mapping algorithms operate on each read independently, - * it is impossible to place reads on the reference genome such at mismatches are minimized across all reads. Consequently, even when some reads are + * it is impossible to place reads on the reference genome such that mismatches are minimized across all reads. Consequently, even when some reads are * correctly mapped with indels, reads covering the indel near just the start or end of the read are often incorrectly mapped with respect the true indel, * also requiring realignment. Local realignment serves to transform regions with misalignments due to indels into clean reads containing a consensus * indel suitable for standard variant discovery approaches. Unlike most mappers, this walker uses the full alignment context to determine whether an @@ -69,7 +69,7 @@ import java.util.TreeSet; *
  • Running the realigner over those intervals (see the IndelRealigner tool)
  • * *

    - * An important note: the input bam(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. + * An important note: the input BAM(s), reference, and known indel file(s) should be the same ones to be used for the IndelRealigner step. *

    * Another important note: because reads produced from the 454 technology inherently contain false indels, the realigner will not currently work with them * (or with reads from similar technologies). This tool also ignores MQ0 reads and reads with consecutive indel operators in the CIGAR string. @@ -101,7 +101,7 @@ import java.util.TreeSet; @Reference(window=@Window(start=-1,stop=50)) @Allows(value={DataSource.READS, DataSource.REFERENCE}) @By(DataSource.REFERENCE) -@BAQMode(ApplicationTime = BAQ.ApplicationTime.FORBIDDEN) +@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) public class RealignerTargetCreator extends RodWalker implements TreeReducible { /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java index 3965a63fb..0165c6cf3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/SomaticIndelDetector.java @@ -39,7 +39,7 @@ import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.filters.MappingQualityZeroFilter; import org.broadinstitute.sting.gatk.filters.Platform454Filter; import org.broadinstitute.sting.gatk.filters.PlatformUnitFilter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.SeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrack; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; @@ -438,8 +438,6 @@ public class SomaticIndelDetector extends ReadWalker { location = getToolkit().getGenomeLocParser().createGenomeLoc(getToolkit().getSAMFileHeader().getSequence(0).getSequenceName(),1); - normalSamples = 
SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeaders().get(0)); - try { // we already checked that bedOutput and output_file are not set simultaneously if ( bedOutput != null ) bedWriter = new FileWriter(bedOutput); @@ -477,7 +475,7 @@ public class SomaticIndelDetector extends ReadWalker { @Override - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) { // if ( read.getReadName().equals("428EFAAXX090610:2:36:1384:639#0") ) System.out.println("GOT READ"); @@ -1152,8 +1150,9 @@ public class SomaticIndelDetector extends ReadWalker { GenotypesContext genotypes = GenotypesContext.create(); for ( String sample : normalSamples ) { - final GenotypeBuilder gb = new GenotypeBuilder(sample); - gb.attributes(call.makeStatsAttributes(null)); + GenotypeBuilder gb = new GenotypeBuilder(sample); + + gb=call.addStatsAttributes(gb); gb.alleles(! discard_event ? alleles // we made a call - put actual het genotype here: : homref_alleles); // no call: genotype is ref/ref (but alleles still contain the alt if we observed anything at all) @@ -1200,8 +1199,11 @@ public class SomaticIndelDetector extends ReadWalker { if ( start == 0 ) return; - Map attrsNormal = nCall.makeStatsAttributes(null); - Map attrsTumor = tCall.makeStatsAttributes(null); + GenotypeBuilder nGB = new GenotypeBuilder(); + GenotypeBuilder tGB = new GenotypeBuilder(); + + nCall.addStatsAttributes(nGB); + tCall.addStatsAttributes(tGB); Map attrs = new HashMap(); @@ -1242,11 +1244,11 @@ public class SomaticIndelDetector extends ReadWalker { GenotypesContext genotypes = GenotypesContext.create(); for ( String sample : normalSamples ) { - genotypes.add(GenotypeBuilder.create(sample, homRefN ? homRefAlleles : alleles, attrsNormal)); + genotypes.add(nGB.name(sample).alleles(homRefN ? 
homRefAlleles : alleles).make()); } for ( String sample : tumorSamples ) { - genotypes.add(GenotypeBuilder.create(sample, homRefT ? homRefAlleles : alleles, attrsTumor)); + genotypes.add(tGB.name(sample).alleles(homRefT ? homRefAlleles : alleles).make()); } Set filters = null; @@ -1254,14 +1256,6 @@ public class SomaticIndelDetector extends ReadWalker { filters = new HashSet(); filters.add("NoCall"); } - if ( nCall.getCoverage() < minNormalCoverage ) { - if ( filters == null ) filters = new HashSet(); - filters.add("NCov"); - } - if ( tCall.getCoverage() < minCoverage ) { - if ( filters == null ) filters = new HashSet(); - filters.add("TCov"); - } VariantContext vc = new VariantContextBuilder("IGv2_Indel_call", refName, start, stop, alleles) .genotypes(genotypes).filters(filters).attributes(attrs).make(); @@ -1844,6 +1838,38 @@ public class SomaticIndelDetector extends ReadWalker { VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,attr); return attr; } + + + /** + * Adds alignment statistics directly into the genotype builder object. 
+ * + * @param gb + * @return + */ + public GenotypeBuilder addStatsAttributes(GenotypeBuilder gb) { + if ( gb == null ) gb = new GenotypeBuilder(); + + gb = VCFIndelAttributes.recordDepth(getConsensusVariantCount(),getAllVariantCount(),getCoverage(),gb); + + gb = VCFIndelAttributes.recordAvMM(getAvConsensusMismatches(),getAvRefMismatches(),gb); + + gb = VCFIndelAttributes.recordAvMapQ(getAvConsensusMapq(),getAvRefMapq(),gb); + + gb = VCFIndelAttributes.recordNQSMMRate(getNQSConsensusMMRate(),getNQSRefMMRate(),gb); + + gb = VCFIndelAttributes.recordNQSAvQ(getNQSConsensusAvQual(),getNQSRefAvQual(),gb); + + gb = VCFIndelAttributes.recordOffsetFromStart(from_start_median,from_start_mad,gb); + + gb = VCFIndelAttributes.recordOffsetFromEnd(from_end_median,from_end_mad,gb); + + PrimitivePair.Int strand_cons = getConsensusStrandCounts(); + PrimitivePair.Int strand_ref = getRefStrandCounts(); + + gb = VCFIndelAttributes.recordStrandCounts(strand_cons.first,strand_cons.second,strand_ref.first,strand_ref.second,gb); + return gb; + } + } interface IndelListener { @@ -2170,18 +2196,18 @@ class VCFIndelAttributes { public static Set getAttributeHeaderLines() { Set lines = new HashSet(); - lines.add(new VCFFormatHeaderLine(ALLELIC_DEPTH_KEY, 2, VCFHeaderLineType.Integer, "# of reads supporting consensus indel/reference at the site")); + lines.add(new VCFFormatHeaderLine(ALLELIC_DEPTH_KEY, 2, VCFHeaderLineType.Integer, "# of reads supporting consensus reference/indel at the site")); lines.add(new VCFFormatHeaderLine(DEPTH_TOTAL_KEY, 1, VCFHeaderLineType.Integer, "Total coverage at the site")); - lines.add(new VCFFormatHeaderLine(MAPQ_KEY, 2, VCFHeaderLineType.Float, "Average mapping qualities of consensus indel-supporting reads/reference-supporting reads")); + lines.add(new VCFFormatHeaderLine(MAPQ_KEY, 2, VCFHeaderLineType.Float, "Average mapping qualities of ref-/consensus indel-supporting reads")); - lines.add(new VCFFormatHeaderLine(MM_KEY, 2, VCFHeaderLineType.Float, 
"Average # of mismatches per consensus indel-supporting read/per reference-supporting read")); + lines.add(new VCFFormatHeaderLine(MM_KEY, 2, VCFHeaderLineType.Float, "Average # of mismatches per ref-/consensus indel-supporting read")); - lines.add(new VCFFormatHeaderLine(NQS_MMRATE_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: fraction of mismatching bases in consensus indel-supporting reads/in reference-supporting reads")); + lines.add(new VCFFormatHeaderLine(NQS_MMRATE_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: fraction of mismatching bases in ref/consensus indel-supporting reads")); - lines.add(new VCFFormatHeaderLine(NQS_AVQ_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: average quality of bases from consensus indel-supporting reads/from reference-supporting reads")); + lines.add(new VCFFormatHeaderLine(NQS_AVQ_KEY, 2, VCFHeaderLineType.Float, "Within NQS window: average quality of bases in ref-/consensus indel-supporting reads")); - lines.add(new VCFFormatHeaderLine(STRAND_COUNT_KEY, 4, VCFHeaderLineType.Integer, "Strandness: counts of forward-/reverse-aligned indel-supporting reads / forward-/reverse-aligned reference supporting reads")); + lines.add(new VCFFormatHeaderLine(STRAND_COUNT_KEY, 4, VCFHeaderLineType.Integer, "Strandness: counts of forward-/reverse-aligned reference and indel-supporting reads (FwdRef,RevRef,FwdIndel,RevIndel)")); lines.add(new VCFFormatHeaderLine(RSTART_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the starts of the reads")); lines.add(new VCFFormatHeaderLine(REND_OFFSET_KEY, 2, VCFHeaderLineType.Integer, "Median/mad of indel offsets from the ends of the reads")); @@ -2194,39 +2220,72 @@ class VCFIndelAttributes { return attrs; } + public static GenotypeBuilder recordStrandCounts(int cnt_cons_fwd, int cnt_cons_rev, int cnt_ref_fwd, int cnt_ref_rev, + GenotypeBuilder gb) { + return gb.attribute(STRAND_COUNT_KEY, new Integer[] {cnt_ref_fwd, cnt_ref_rev,cnt_cons_fwd, cnt_cons_rev 
} ); + } + public static Map recordDepth(int cnt_cons, int cnt_indel, int cnt_total, Map attrs) { - attrs.put(ALLELIC_DEPTH_KEY, new Integer[] {cnt_cons, cnt_indel} ); + attrs.put(ALLELIC_DEPTH_KEY, new Integer[] {cnt_total-cnt_indel, cnt_cons} ); attrs.put(DEPTH_TOTAL_KEY, cnt_total); return attrs; } + public static GenotypeBuilder recordDepth(int cnt_cons, int cnt_indel, int cnt_total, GenotypeBuilder gb) { + return gb.AD(new int[] {cnt_total-cnt_indel,cnt_cons} ).DP(cnt_total); + } + public static Map recordAvMapQ(double cons, double ref, Map attrs) { - attrs.put(MAPQ_KEY, new Float[] {(float)cons, (float)ref} ); + attrs.put(MAPQ_KEY, new Float[] {(float)ref, (float)cons} ); return attrs; } + public static GenotypeBuilder recordAvMapQ(double cons, double ref, GenotypeBuilder gb) { + return gb.attribute(MAPQ_KEY,new float[] {(float)ref, (float)cons} ); + } + public static Map recordAvMM(double cons, double ref, Map attrs) { - attrs.put(MM_KEY, new Float[] {(float)cons, (float)ref} ); + attrs.put(MM_KEY, new Float[] {(float)ref, (float)cons} ); return attrs; } + public static GenotypeBuilder recordAvMM(double cons, double ref, GenotypeBuilder gb) { + return gb.attribute(MM_KEY, new float[] {(float)ref, (float)cons} ); + } + public static Map recordNQSMMRate(double cons, double ref, Map attrs) { - attrs.put(NQS_MMRATE_KEY, new Float[] {(float)cons, (float)ref} ); + attrs.put(NQS_MMRATE_KEY, new Float[] {(float)ref, (float)cons} ); return attrs; } + public static GenotypeBuilder recordNQSMMRate(double cons, double ref, GenotypeBuilder gb) { + return gb.attribute(NQS_MMRATE_KEY, new float[] {(float)ref, (float)cons} ); + } + public static Map recordNQSAvQ(double cons, double ref, Map attrs) { - attrs.put(NQS_AVQ_KEY, new Float[] {(float)cons, (float)ref} ); + attrs.put(NQS_AVQ_KEY, new float[] {(float)ref, (float)cons} ); return attrs; } + public static GenotypeBuilder recordNQSAvQ(double cons, double ref, GenotypeBuilder gb) { + return gb.attribute(NQS_AVQ_KEY, new 
float[] {(float)ref, (float)cons} ); + } + public static Map recordOffsetFromStart(int median, int mad, Map attrs) { attrs.put(RSTART_OFFSET_KEY, new Integer[] {median, mad} ); return attrs; } + public static GenotypeBuilder recordOffsetFromStart(int median, int mad, GenotypeBuilder gb) { + return gb.attribute(RSTART_OFFSET_KEY, new int[] {median, mad} ); + } + public static Map recordOffsetFromEnd(int median, int mad, Map attrs) { attrs.put(REND_OFFSET_KEY, new Integer[] {median, mad} ); return attrs; } + + public static GenotypeBuilder recordOffsetFromEnd(int median, int mad, GenotypeBuilder gb) { + return gb.attribute(REND_OFFSET_KEY, new int[] {median, mad} ); + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java index bbd4bf92f..7ebfec49e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhaseByTransmission.java @@ -36,7 +36,7 @@ import java.util.*; *

      *
    • In parent/child pairs: If an individual genotype is missing at one site, the other one is phased if it is homozygous. No phasing probability is emitted.
    • *
    • In trios: If the child is missing, parents are treated as separate individuals and phased if homozygous. No phasing probability is emitted.
    • - *
    • In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probabilitt is emitted.
    • + *
    • In trios: If one of the parents is missing, it is handled like a parent/child pair. Phasing is done unless both the parent and child are heterozygous and a phasing probability is emitted.
    • *
    • In trios: If two individuals are missing, the remaining individual is phased if it is homozygous. No phasing probability is emitted.
    • *
    * @@ -541,7 +541,7 @@ public class PhaseByTransmission extends RodWalker, HashMa //Get a Map of genotype likelihoods. //In case of null, unavailable or no call, all likelihoods are 1/3. private EnumMap getLikelihoodsAsMapSafeNull(Genotype genotype){ - if(genotype == null || !genotype.isCalled()){ + if(genotype == null || !genotype.isCalled() || genotype.getLikelihoods() == null){ EnumMap likelihoods = new EnumMap(GenotypeType.class); likelihoods.put(GenotypeType.HOM_REF,1.0/3.0); likelihoods.put(GenotypeType.HET,1.0/3.0); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java new file mode 100755 index 000000000..d38c11594 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedCoverage.java @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.CommandLineGATK; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.filters.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.*; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.*; + +/** + * Emits intervals present in either the original or reduced bam but not the other. + * + *

    Input

    + *

    + * The original and reduced BAM files. + *

    + * + *

    Output

    + *

    + * A list of intervals present in one bam but not the other. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -I:original original.bam \
    + *   -I:reduced reduced.bam \
    + *   -R ref.fasta \
    + *   -T AssessReducedCoverage \
    + *   -o output.intervals
    + * 
    + * + * @author ebanks + */ +@DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) +@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class}) +@Hidden +public class AssessReducedCoverage extends LocusWalker implements TreeReducible { + + private static final String original = "original"; + private static final String reduced = "reduced"; + + @Output + protected PrintStream out; + + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + @Argument(fullName = "output_reduced_only_coverage", shortName = "output_reduced_only_coverage", doc = "Output an interval if the reduced bam has coverage where the original does not", required = false) + public boolean OUTPUT_REDUCED_ONLY_INTERVALS = false; + + public void initialize() {} + + public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + + if ( tracker == null ) + return null; + + Set tags = getAllTags(context.getBasePileup()); + return (tags.contains(original) && !tags.contains(reduced)) || + (OUTPUT_REDUCED_ONLY_INTERVALS && tags.contains(reduced) && !tags.contains(original)) ? 
ref.getLocus() : null; + } + + private Set getAllTags(final ReadBackedPileup pileup) { + + final Set tags = new HashSet(10); + + for ( final PileupElement p : pileup ) { + if ( (int)p.getQual() > 2 && p.getMappingQual() > 0 && !p.isDeletion() ) + tags.addAll(getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags()); + } + + return tags; + } + + public void onTraversalDone(GenomeLoc sum) { + if ( sum != null ) + out.println(sum); + } + + public GenomeLoc reduceInit() { + return null; + } + + public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { + if ( lhs == null ) + return rhs; + + if ( rhs == null ) + return lhs; + + // if contiguous, just merge them + if ( lhs.contiguousP(rhs) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); + + // otherwise, print the lhs and start over with the rhs + out.println(lhs); + return rhs; + } + + public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { + if ( value == null ) + return sum; + + if ( sum == null ) + return value; + + // if contiguous, just merge them + if ( sum.contiguousP(value) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); + + // otherwise, print the sum and start over with the value + out.println(sum); + return value; + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java new file mode 100644 index 000000000..78bcf1228 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java @@ -0,0 +1,173 @@ +package org.broadinstitute.sting.gatk.walkers.qc; + +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.TreeReducible; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; + +import java.io.PrintStream; +import java.util.List; + +/** + * Emits intervals in which the differences between the original and reduced bam quals are bigger than epsilon (unless the quals of + * the reduced bam are above a sufficient threshold) + * + *

    Input

    + *

    + * The original and reduced BAM files. + *

    + * + *

    Output

    + *

    + * A list of intervals in which the differences between the original and reduced bam quals are bigger than epsilon. + *

    + * + *

    Examples

    + *
    + * java -Xmx2g -jar GenomeAnalysisTK.jar \
    + *   -I:original original.bam \
    + *   -I:reduced reduced.bam \
    + *   -R ref.fasta \
    + *   -T AssessReducedQuals \
    + *   -o output.intervals
    + * 
    + * + * @author ami + */ + +public class AssessReducedQuals extends LocusWalker implements TreeReducible { + + private static final String reduced = "reduced"; + private static final String original = "original"; + private static final int originalQualsIndex = 0; + private static final int reducedQualsIndex = 1; + + @Argument(fullName = "sufficientQualSum", shortName = "sufficientQualSum", doc = "When a reduced bam qual sum is above this threshold, it passes even without comparing to the non-reduced bam ", required = false) + public int sufficientQualSum = 600; + + @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > epsilon*Quals_original_bam we output this interval", required = false) + public int qual_epsilon = 0; + + @Argument(fullName = "debugLevel", shortName = "debug", doc = "debug mode on") // TODO -- best to make this optional + public int debugLevel = 0; // TODO -- best to make this an enum or boolean + + @Output + protected PrintStream out; + + public void initialize() { + if (debugLevel != 0) + out.println(" Debug mode" + + "Debug:\tsufficientQualSum: "+sufficientQualSum+ "\n " + + "Debug:\tqual_epsilon: "+qual_epsilon); + } + + @Override + public boolean includeReadsWithDeletionAtLoci() { return true; } + + @Override + public GenomeLoc map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + if ( tracker == null ) + return null; + + boolean reportLocus; + final int[] quals = getPileupQuals(context.getBasePileup()); + if (debugLevel != 0) + out.println("Debug:\tLocus Quals\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]); + final int epsilon = MathUtils.fastRound(quals[originalQualsIndex]*qual_epsilon); + final int calcOriginalQuals = Math.min(quals[originalQualsIndex],sufficientQualSum); + final int calcReducedQuals = Math.min(quals[reducedQualsIndex],sufficientQualSum); + final int OriginalReducedQualDiff = 
calcOriginalQuals - calcReducedQuals; + reportLocus = OriginalReducedQualDiff > epsilon || OriginalReducedQualDiff < -1*epsilon; + if(debugLevel != 0 && reportLocus) + out.println("Debug:\tEmited Locus\t"+ref.getLocus()+"\toriginal\t"+quals[originalQualsIndex]+"\treduced\t"+quals[reducedQualsIndex]+"\tepsilon\t"+epsilon+"\tdiff\t"+OriginalReducedQualDiff); + + return reportLocus ? ref.getLocus() : null; + } + + private final int[] getPileupQuals(final ReadBackedPileup readPileup) { + + final int[] quals = new int[2]; + String[] printPileup = {"Debug 2:\toriginal pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n", + "Debug 2:\t reduced pileup:\t"+readPileup.getLocation()+"\nDebug 2:----------------------------------\n"}; + + for( PileupElement p : readPileup ){ + final List tags = getToolkit().getReaderIDForRead(p.getRead()).getTags().getPositionalTags(); + if ( isGoodRead(p,tags) ){ + final int tempQual = (int)(p.getQual()) * p.getRepresentativeCount(); + final int tagIndex = getTagIndex(tags); + quals[tagIndex] += tempQual; + if(debugLevel == 2) + printPileup[tagIndex] += "\tDebug 2: ("+p+")\tMQ="+p.getMappingQual()+":QU="+p.getQual()+":RC="+p.getRepresentativeCount()+":OS="+p.getOffset()+"\n"; + } + } + if(debugLevel == 2){ + out.println(printPileup[originalQualsIndex]); + out.println(printPileup[reducedQualsIndex]); + } + return quals; + } + + // TODO -- arguments/variables should be final, not method declaration + private final boolean isGoodRead(PileupElement p, List tags){ + // TODO -- this isn't quite right. You don't need the tags here. Instead, you want to check whether the read itself (which + // TODO -- you can get from the PileupElement) is a reduced read (not all reads from the reduced bam are reduced) and only + // TODO -- for them do you want to ignore that min mapping quality cutoff (but you *do* still want the min base cutoff). 
+ return !p.isDeletion() && (tags.contains(reduced) || (tags.contains(original) && (int)p.getQual() >= 20 && p.getMappingQual() >= 20)); + } + + private final int getTagIndex(List tags){ + return tags.contains(reduced) ? 1 : 0; + } + + @Override + public void onTraversalDone(GenomeLoc sum) { + if ( sum != null ) + out.println(sum); + } + + @Override + public GenomeLoc treeReduce(GenomeLoc lhs, GenomeLoc rhs) { + if ( lhs == null ) + return rhs; + + if ( rhs == null ) + return lhs; + + // if contiguous, just merge them + if ( lhs.contiguousP(rhs) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(lhs.getContig(), lhs.getStart(), rhs.getStop()); + + // otherwise, print the lhs and start over with the rhs + out.println(lhs); + return rhs; + } + + @Override + public GenomeLoc reduceInit() { + return null; + } + + @Override + public GenomeLoc reduce(GenomeLoc value, GenomeLoc sum) { + if ( value == null ) + return sum; + + if ( sum == null ) + return value; + + // if contiguous, just merge them + if ( sum.contiguousP(value) ) + return getToolkit().getGenomeLocParser().createGenomeLoc(sum.getContig(), sum.getStart(), value.getStop()); + + // otherwise, print the sum and start over with the value + out.println(sum); + return value; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java index 0c323934e..9954a25e8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountBases.java @@ -2,7 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import 
org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -36,7 +36,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountBases extends ReadWalker { - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return read.getReadLength(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java index bd10eab87..cd295f26e 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountLoci.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.LocusWalker; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -40,7 +41,7 @@ import java.io.PrintStream; * */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class CountLoci extends LocusWalker implements TreeReducible { +public class CountLoci extends LocusWalker implements TreeReducible, NanoSchedulable { @Output(doc="Write count to this file instead of STDOUT") PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java index 
bc178119d..f2e4cf1ad 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountMales.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.samples.Gender; import org.broadinstitute.sting.gatk.samples.Sample; import org.broadinstitute.sting.gatk.walkers.DataSource; @@ -41,7 +41,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountMales extends ReadWalker { - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { Sample sample = getSampleDB().getSample(read); return sample.getGender() == Gender.MALE ? 
1 : 0; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java index 9915d617e..ab37a2322 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountRODs.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.GenomeLoc; @@ -73,7 +74,7 @@ import java.util.*; * */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class CountRODs extends RodWalker, Long>> implements TreeReducible, Long>> { +public class CountRODs extends RodWalker, Long>> implements TreeReducible, Long>>, NanoSchedulable { @Output public PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java index 80845c447..80afd19fa 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReadEvents.java @@ -4,7 +4,7 @@ import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import 
org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; @@ -47,7 +47,7 @@ public class CountReadEvents extends ReadWalker> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Map> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return ReadUtils.getCigarOperatorForAllBases(read); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java index 5a9e5e7d2..1d2c6c9cc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountReads.java @@ -2,8 +2,9 @@ package org.broadinstitute.sting.gatk.walkers.qc; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; @@ -32,7 +33,6 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ * -T CountReads \ - * -o output.txt \ * -I input.bam \ * [-L input.intervals] * @@ -40,15 +40,11 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; */ @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) -public class CountReads extends ReadWalker { - public Integer map(ReferenceContext ref, GATKSAMRecord read, 
ReadMetaDataTracker tracker) { - +public class CountReads extends ReadWalker implements NanoSchedulable { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { return 1; } - public Integer reduceInit() { return 0; } - - public Integer reduce(Integer value, Integer sum) { - return value + sum; - } + @Override public Integer reduceInit() { return 0; } + @Override public Integer reduce(Integer value, Integer sum) { return value + sum; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java index 971b5bb85..09d239126 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/CountTerminusEvent.java @@ -4,7 +4,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -41,7 +41,7 @@ import java.util.List; @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) @Requires({DataSource.READS, DataSource.REFERENCE}) public class CountTerminusEvent extends ReadWalker, Pair> { - public Pair map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Pair map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { List cigarElements = read.getCigar().getCigarElements(); CigarElement lastElement = null; diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java index a3df3bc13..2039b7394 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ErrorThrowing.java @@ -24,13 +24,15 @@ package org.broadinstitute.sting.gatk.walkers.qc; +import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.RodWalker; +import org.broadinstitute.sting.gatk.walkers.NanoSchedulable; +import org.broadinstitute.sting.gatk.walkers.RefWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -41,24 +43,31 @@ import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; */ @Hidden @DocumentedGATKFeature( groupName = "Quality Control and Simple Analysis Tools", extraDocs = {CommandLineGATK.class} ) -public class ErrorThrowing extends RodWalker implements TreeReducible { +public class ErrorThrowing extends RefWalker implements TreeReducible, NanoSchedulable { @Input(fullName="exception", shortName = "E", doc="Java class of exception to throw", required=true) public String exceptionToThrow; + @Argument(fullName = "failMethod", shortName = "fail", doc = "Determines which method to fail in", required = false) + public FailMethod failMethod = FailMethod.MAP; + + public enum FailMethod { + MAP, + REDUCE, + TREE_REDUCE + } + // // Template code to allow us to build the 
walker, doesn't actually do anything // @Override public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { - if ( exceptionToThrow.equals("UserException") ) { - throw new UserException("UserException"); - } else if ( exceptionToThrow.equals("NullPointerException") ) { - throw new NullPointerException(); - } else if ( exceptionToThrow.equals("ReviewedStingException") ) { - throw new ReviewedStingException("ReviewedStingException"); - } else { - throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); - } + if ( ref == null ) // only throw exception when we are in proper map, not special map(null) call + return null; + + if ( failMethod == FailMethod.MAP ) + fail(); + + return 0; } @Override @@ -68,10 +77,34 @@ public class ErrorThrowing extends RodWalker implements TreeRed @Override public Integer reduce(Integer value, Integer sum) { - return value + sum; + if ( value != null && failMethod == FailMethod.REDUCE ) + fail(); + return sum; } public Integer treeReduce(final Integer lhs, final Integer rhs) { - return lhs + rhs; + if ( failMethod == FailMethod.TREE_REDUCE ) + fail(); + return rhs; + } + + private void fail() { + if ( exceptionToThrow.equals("UserException") ) { + throw new UserException("UserException"); + } else if ( exceptionToThrow.equals("NullPointerException") ) { + throw new NullPointerException(); + } else if ( exceptionToThrow.equals("ReviewedStingException") ) { + throw new ReviewedStingException("ReviewedStingException"); + } else if ( exceptionToThrow.equals("SamError1") ) { + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_1); + } else if ( exceptionToThrow.equals("SamError2") ) { + throw new RuntimeException(CommandLineGATK.PICARD_TEXT_SAM_FILE_ERROR_2); + } else if ( exceptionToThrow.equals("NoSpace1") ) { + throw new net.sf.samtools.util.RuntimeIOException(new java.io.IOException("No space left on device 
java.io.FileOutputStream.writeBytes(Native Method)")); + } else if ( exceptionToThrow.equals("NoSpace2") ) { + throw new net.sf.samtools.SAMException("Exception writing BAM index file", new java.io.IOException("No space left on device java.io.FileOutputStream.writeBytes(Native Method)")); + } else { + throw new UserException.BadArgumentValue("exception", "exception isn't a recognized value " + exceptionToThrow); + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java index 16d614afc..ec4f081a6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/qc/ReadClippingStats.java @@ -29,7 +29,7 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Requires; @@ -75,7 +75,7 @@ public class ReadClippingStats extends ReadWalker samples; + private enum GVstatus { + T, F, NONE + } + public static class CountedData { private long nAltCalledAlt = 0L; private long nAltCalledRef = 0L; @@ -336,8 +333,9 @@ public class GenotypeAndValidate extends RodWalker 0 && context.getBasePileup().getBases().length < minDepth)) { counter.nUncovered = 1L; - if (vcComp.getAttribute("GV").equals("T")) + final GVstatus status = getGVstatus(vcComp); + if ( status == GVstatus.T ) counter.nAltNotCalled = 1L; - else if (vcComp.getAttribute("GV").equals("F")) + else if ( status == GVstatus.F ) counter.nRefNotCalled = 1L; else 
counter.nNoStatusNotCalled = 1L; @@ -427,10 +426,11 @@ public class GenotypeAndValidate extends RodWalker sm, double refLik) { super(sm); referenceLikelihood = refLik; @@ -45,15 +48,18 @@ public class GLBasedSampleSelector extends SampleSelector { // first subset to the samples VariantContext subContext = vc.subContextFromSamples(samples); + if ( ! subContext.isPolymorphicInSamples() ) + return false; + // now check to see (using EXACT model) whether this should be variant // do we want to apply a prior? maybe user-spec? if ( flatPriors == null ) { flatPriors = new double[1+2*samples.size()]; + AFCalculator = AFCalcFactory.createAFCalc(samples.size(), 4, 2); } - AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(vc.getAlternateAlleles().size()); - ExactAFCalculationModel.linearExactMultiAllelic(subContext.getGenotypes(),vc.getAlternateAlleles().size(),flatPriors,result); + final AFCalcResult result = AFCalculator.getLog10PNonRef(subContext, flatPriors); // do we want to let this qual go up or down? - if ( result.getLog10PosteriorOfAFzero() < referenceLikelihood ) { + if ( result.getLog10LikelihoodOfAFEq0() < referenceLikelihood ) { return true; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java index 58cd14737..a73e125ad 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java @@ -120,13 +120,15 @@ public class VariantEval extends RodWalker implements TreeRedu /** * Some analyses want to count overlap not with dbSNP (which is in general very open) but * actually want to itemize their overlap specifically with a set of gold standard sites - * such as HapMap, OMNI, or the gold standard indels. Theis argument provides a mechanism + * such as HapMap, OMNI, or the gold standard indels. 
This argument provides a mechanism * for communicating which file to use */ @Input(fullName="goldStandard", shortName = "gold", doc="Evaluations that count calls at sites of true variation (e.g., indel calls) will use this argument as their gold standard for comparison", required=false) public RodBinding goldStandard = null; - // Help arguments + /** + * Note that the --list argument requires a fully resolved and correct command-line to work. + */ @Argument(fullName="list", shortName="ls", doc="List the available eval modules and exit", required=false) protected Boolean LIST = false; @@ -169,6 +171,9 @@ public class VariantEval extends RodWalker implements TreeRedu @Argument(shortName="mvq", fullName="mendelianViolationQualThreshold", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation. Default is 50.", required=false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 50; + @Argument(shortName="ploidy", fullName="samplePloidy", doc="Per-sample ploidy (number of chromosomes per sample)", required=false) + protected int ploidy = VariantContextUtils.DEFAULT_PLOIDY; + @Argument(fullName="ancestralAlignments", shortName="aa", doc="Fasta file with ancestral alleles", required=false) private File ancestralAlignmentsFile = null; @@ -572,6 +577,7 @@ public class VariantEval extends RodWalker implements TreeRedu public double getMinPhaseQuality() { return MIN_PHASE_QUALITY; } + public int getSamplePloidy() { return ploidy; } public double getMendelianViolationQualThreshold() { return MENDELIAN_VIOLATION_QUAL_THRESHOLD; } public static String getAllSampleName() { return ALL_SAMPLE_NAME; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index 2b1bd9c62..e6efd4482 100755 --- 
a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -27,9 +27,9 @@ public class AlleleCount extends VariantStratifier { if ( getVariantEvalWalker().getEvals().size() != 1 && !getVariantEvalWalker().mergeEvals ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification only works with a single eval vcf"); - // There are 2 x n sample chromosomes for diploids + // There are ploidy x n sample chromosomes // TODO -- generalize to handle multiple ploidy - nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * 2; + nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * getVariantEvalWalker().getSamplePloidy(); if ( nchrom < 2 ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification requires an eval vcf with at least one sample"); @@ -45,12 +45,22 @@ public class AlleleCount extends VariantStratifier { if (eval != null) { int AC = 0; // by default, the site is considered monomorphic - if ( eval.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) && eval.isBiallelic() ) { - // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. but the exact model may arbitrarily choose an AC>1) - AC = Math.min(eval.getAttributeAsInt(VCFConstants.MLE_ALLELE_COUNT_KEY, 0), nchrom); - } else if ( eval.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) && eval.isBiallelic() ) { - AC = eval.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0); - } else if ( eval.isVariant() ) { + try { + if ( eval.isBiallelic() ) { + if ( eval.hasAttribute(VCFConstants.MLE_ALLELE_COUNT_KEY) ) { + // the MLEAC is allowed to be larger than the AN (e.g. in the case of all PLs being 0, the GT is ./. 
but the exact model may arbitrarily choose an AC>1) + AC = Math.min(eval.getAttributeAsInt(VCFConstants.MLE_ALLELE_COUNT_KEY, 0), nchrom); + } else if ( eval.hasAttribute(VCFConstants.ALLELE_COUNT_KEY) ) { + AC = eval.getAttributeAsInt(VCFConstants.ALLELE_COUNT_KEY, 0); + } + } + } catch ( ClassCastException e ) { + // protect ourselves from bad inputs + // TODO -- fully decode VC + } + + if ( AC == 0 && eval.isVariant() ) { + // fall back to the direct calculation for (Allele allele : eval.getAlternateAlleles()) AC = Math.max(AC, eval.getCalledChrCount(allele)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java index 011f3471c..158d1e78a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/ApplyRecalibration.java @@ -39,11 +39,11 @@ import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder; +import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import java.io.File; import java.util.*; @@ -218,7 +218,7 @@ public class ApplyRecalibration extends RodWalker implements T String filterString = null; // Annotate the new record with its VQSLOD and the worst performing 
annotation - builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lodString); // use the String representation so that we don't lose precision on output + builder.attribute(VariantRecalibrator.VQS_LOD_KEY, lod); builder.attribute(VariantRecalibrator.CULPRIT_KEY, recalDatum.getAttribute(VariantRecalibrator.CULPRIT_KEY)); for( int i = tranches.size() - 1; i >= 0; i-- ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java index 9228dc375..042d4741d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/Tranche.java @@ -29,10 +29,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.XReadLines; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.PrintStream; +import java.io.*; import java.util.*; /** @@ -41,7 +38,7 @@ import java.util.*; * Date: Mar 10, 2011 */ -public class Tranche implements Comparable { +public class Tranche { private static final int CURRENT_VERSION = 5; public double ts, minVQSLod, knownTiTv, novelTiTv; @@ -83,10 +80,14 @@ public class Tranche implements Comparable { return accessibleTruthSites > 0 ? 
callsAtTruthSites / (1.0*accessibleTruthSites) : 0.0; } - public int compareTo(Tranche other) { - return Double.compare(this.ts, other.ts); + public static class TrancheTruthSensitivityComparator implements Comparator, Serializable { + @Override + public int compare(final Tranche tranche1, final Tranche tranche2) { + return Double.compare(tranche1.ts, tranche2.ts); + } } + @Override public String toString() { return String.format("Tranche ts=%.2f minVQSLod=%.4f known=(%d @ %.4f) novel=(%d @ %.4f) truthSites(%d accessible, %d called), name=%s]", ts, minVQSLod, numKnown, knownTiTv, numNovel, novelTiTv, accessibleTruthSites, callsAtTruthSites, name); @@ -102,7 +103,7 @@ public class Tranche implements Comparable { final ByteArrayOutputStream bytes = new ByteArrayOutputStream(); final PrintStream stream = new PrintStream(bytes); - Collections.sort(tranches); + Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); stream.println("# Variant quality score tranches file"); stream.println("# Version number " + CURRENT_VERSION); @@ -183,7 +184,7 @@ public class Tranche implements Comparable { } } - Collections.sort(tranches); + Collections.sort( tranches, new TrancheTruthSensitivityComparator() ); return tranches; } catch( FileNotFoundException e ) { throw new UserException.CouldNotReadInputFile(f, e); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java index af0778399..58b4e4fc7 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/TrancheManager.java @@ -146,7 +146,7 @@ public class TrancheManager { public static List findTranches( final ArrayList data, final double[] trancheThresholds, final SelectionMetric metric, final VariantRecalibratorArgumentCollection.Mode model, 
final File debugFile ) { logger.info(String.format("Finding %d tranches for %d variants", trancheThresholds.length, data.size())); - Collections.sort(data); + Collections.sort( data, new VariantDatum.VariantDatumLODComparator() ); metric.calculateRunningMetric(data); if ( debugFile != null) { writeTranchesDebuggingInfo(debugFile, data, metric); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index e88505f99..aacd987d5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -158,7 +158,7 @@ public class VariantDataManager { logger.info( "Found " + numBadSitesAdded + " variants overlapping bad sites training tracks." ); // Next sort the variants by the LOD coming from the positive model and add to the list the bottom X percent of variants - Collections.sort( data ); + Collections.sort( data, new VariantDatum.VariantDatumLODComparator() ); final int numToAdd = Math.max( minimumNumber - trainingData.size(), Math.round((float)bottomPercentage * data.size()) ); if( numToAdd > data.size() ) { throw new UserException.BadInput( "Error during negative model training. Minimum number of variants to use in training is larger than the whole call set. One can attempt to lower the --minNumBadVariants arugment but this is unsafe." 
); @@ -297,7 +297,7 @@ public class VariantDataManager { case SNP: return evalVC.isSNP() || evalVC.isMNP(); case INDEL: - return evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); + return evalVC.isStructuralIndel() || evalVC.isIndel() || evalVC.isMixed() || evalVC.isSymbolic(); case BOTH: return true; default: diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java index a85129d78..7b3b0d17d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDatum.java @@ -27,13 +27,16 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.broadinstitute.sting.utils.GenomeLoc; +import java.io.Serializable; +import java.util.Comparator; + /** * Created by IntelliJ IDEA. * User: rpoplin * Date: Mar 4, 2011 */ -public class VariantDatum implements Comparable { +public class VariantDatum { public double[] annotations; public boolean[] isNull; @@ -52,8 +55,10 @@ public class VariantDatum implements Comparable { public int worstAnnotation; public MultivariateGaussian assignment; // used in K-means implementation - @Override - public int compareTo( final VariantDatum other ) { - return Double.compare(this.lod, other.lod); + public static class VariantDatumLODComparator implements Comparator, Serializable { + @Override + public int compare(final VariantDatum datum1, final VariantDatum datum2) { + return Double.compare(datum1.lod, datum2.lod); + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java index 555999bdb..b1d8dc91d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariants.java @@ -313,7 +313,7 @@ public class CombineVariants extends RodWalker implements Tree VariantContextUtils.calculateChromosomeCounts(builder, false); if ( minimalVCF ) VariantContextUtils.pruneVariantContext(builder, Arrays.asList(SET_KEY)); - vcfWriter.add(VariantContextUtils.addMissingSamples(builder.make(), samples)); + vcfWriter.add(builder.make()); } return vcs.isEmpty() ? 0 : 1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java index f89bcb2a7..92d6e686b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/FilterLiftedVariants.java @@ -75,7 +75,7 @@ public class FilterLiftedVariants extends RodWalker { boolean failed = false; byte[] recordRef = vc.getReference().getBases(); for (int i = 0; i < recordRef.length && i < MAX_VARIANT_SIZE; i++) { - if ( recordRef[i] != ref[i + (vc.isPointEvent() ? 
0 : 1)] ) { + if ( recordRef[i] != ref[i] ) { failed = true; break; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index bfd9aa52f..d1b7cb96f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -40,16 +40,16 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyper; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.utils.MendelianViolation; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.text.XReadLines; import org.broadinstitute.sting.utils.variantcontext.*; +import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import java.io.File; import java.io.FileNotFoundException; -import java.io.PrintStream; import java.util.*; /** @@ -265,7 +265,7 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. 
Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false) private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL; - @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false) + @Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Store the original AC, AF, and AN values in the INFO field after selecting (using keys AC_Orig, AF_Orig, and AN_Orig)", required=false) private boolean KEEP_ORIGINAL_CHR_COUNTS = false; /** @@ -277,13 +277,6 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="mendelianViolationQualThreshold", shortName="mvq", doc="Minimum genotype QUAL score for each trio member required to accept a site as a violation", required=false) protected double MENDELIAN_VIOLATION_QUAL_THRESHOLD = 0; - /** - * Variants are kept in memory to guarantee that exactly n variants will be chosen randomly, so make sure you supply the program with enough memory - * given your input set. This option will NOT work well for large callsets; use --select_random_fraction for sets with a large numbers of variants. - */ - @Argument(fullName="select_random_number", shortName="number", doc="Selects a number of variants at random from the variant track", required=false) - protected int numRandom = 0; - /** * This routine is based on probability, so the final result is not guaranteed to carry the exact fraction. Can be used for large fractions. */ @@ -322,20 +315,12 @@ public class SelectVariants extends RodWalker implements TreeR @Argument(fullName="justRead", doc="If true, we won't actually write the output file. 
For efficiency testing only", required=false) private boolean justRead = false; + @Argument(doc="indel size select",required=false,fullName="maxIndelSize") + private int maxIndelSize = Integer.MAX_VALUE; - /* Private class used to store the intermediate variants in the integer random selection process */ - private static class RandomVariantStructure { - private VariantContext vc; + @Argument(doc="Allow a samples other than those in the VCF to be specified on the command line. These samples will be ignored.",required=false,fullName="ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES") + private boolean ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES = false; - RandomVariantStructure(VariantContext vcP) { - vc = vcP; - } - - public void set (VariantContext vcP) { - vc = vcP; - } - - } public enum NumberAlleleRestriction { ALL, @@ -357,12 +342,7 @@ public class SelectVariants extends RodWalker implements TreeR /* variables used by the SELECT RANDOM modules */ - private boolean SELECT_RANDOM_NUMBER = false; private boolean SELECT_RANDOM_FRACTION = false; - private int variantNumber = 0; - private int nVariantsAdded = 0; - private int positionToAdd = 0; - private RandomVariantStructure [] variantArray; //Random number generator for the genotypes to remove private Random randomGenotypes = new Random(); @@ -383,10 +363,31 @@ public class SelectVariants extends RodWalker implements TreeR Collection samplesFromFile = SampleUtils.getSamplesFromFiles(sampleFiles); Collection samplesFromExpressions = SampleUtils.matchSamplesExpressions(vcfSamples, sampleExpressions); - // first, add any requested samples - samples.addAll(samplesFromFile); - samples.addAll(samplesFromExpressions); + // first, check overlap between requested and present samples + Set commandLineUniqueSamples = new HashSet(samplesFromFile.size()+samplesFromExpressions.size()+sampleNames.size()); + commandLineUniqueSamples.addAll(samplesFromFile); + commandLineUniqueSamples.addAll(samplesFromExpressions); + 
commandLineUniqueSamples.addAll(sampleNames); + commandLineUniqueSamples.removeAll(vcfSamples); + + // second, add the requested samples samples.addAll(sampleNames); + samples.addAll(samplesFromExpressions); + samples.addAll(samplesFromFile); + + logger.debug(Utils.join(",",commandLineUniqueSamples)); + + if ( commandLineUniqueSamples.size() > 0 && ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES ) { + logger.warn("Samples present on command line input that are not present in the VCF. These samples will be ignored."); + samples.removeAll(commandLineUniqueSamples); + } else if (commandLineUniqueSamples.size() > 0 ) { + throw new UserException.BadInput(String.format("%s%n%n%s%n%n%s%n%n%s", + "Samples entered on command line (through -sf or -sn) that are not present in the VCF.", + "A list of these samples:", + Utils.join(",",commandLineUniqueSamples), + "To ignore these samples, run with --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES")); + } + // if none were requested, we want all of them if ( samples.isEmpty() ) { @@ -450,12 +451,6 @@ public class SelectVariants extends RodWalker implements TreeR mv = new MendelianViolation(MENDELIAN_VIOLATION_QUAL_THRESHOLD,false,true); } - SELECT_RANDOM_NUMBER = numRandom > 0; - if (SELECT_RANDOM_NUMBER) { - logger.info("Selecting " + numRandom + " variants at random from the variant track"); - variantArray = new RandomVariantStructure[numRandom]; - } - SELECT_RANDOM_FRACTION = fractionRandom > 0; if (SELECT_RANDOM_FRACTION) logger.info("Selecting approximately " + 100.0*fractionRandom + "% of the variants at random from the variant track"); @@ -464,7 +459,6 @@ public class SelectVariants extends RodWalker implements TreeR UAC.GLmodel = GenotypeLikelihoodsCalculationModel.Model.BOTH; UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_ALL_SITES; UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES; - UAC.NO_SLOD = true; UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, 
null, samples, VariantContextUtils.DEFAULT_PLOIDY); headerLines.addAll(UnifiedGenotyper.getHeaderInfo(UAC, null, null)); } @@ -541,12 +535,17 @@ public class SelectVariants extends RodWalker implements TreeR if (!selectedTypes.contains(vc.getType())) continue; + if ( badIndelSize(vc) ) + continue; + VariantContext sub = subsetRecord(vc, EXCLUDE_NON_VARIANTS); if ( REGENOTYPE && sub.isPolymorphicInSamples() && hasPLs(sub) ) { - final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(sub)).filters(sub.getFiltersMaybeNull()); - addAnnotations(builder, sub); - sub = builder.make(); + synchronized (UG_engine) { + final VariantContextBuilder builder = new VariantContextBuilder(UG_engine.calculateGenotypes(sub)).filters(sub.getFiltersMaybeNull()); + addAnnotations(builder, sub); + sub = builder.make(); + } } if ( (!EXCLUDE_NON_VARIANTS || sub.isPolymorphicInSamples()) && (!EXCLUDE_FILTERED || !sub.isFiltered()) ) { @@ -557,14 +556,10 @@ public class SelectVariants extends RodWalker implements TreeR break; } } - if ( !failedJexlMatch ) { - if (SELECT_RANDOM_NUMBER) { - randomlyAddVariant(++variantNumber, sub); - } - else if (!SELECT_RANDOM_FRACTION || ( GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom)) { - if ( ! 
justRead ) - vcfWriter.add(sub); - } + if ( !failedJexlMatch && + !justRead && + ( !SELECT_RANDOM_FRACTION || GenomeAnalysisEngine.getRandomGenerator().nextDouble() < fractionRandom ) ) { + vcfWriter.add(sub); } } } @@ -572,6 +567,18 @@ public class SelectVariants extends RodWalker implements TreeR return 1; } + private boolean badIndelSize(final VariantContext vc) { + List lengths = vc.getIndelLengths(); + if ( lengths == null ) + return false; // VC does not harbor indel + for ( Integer indelLength : vc.getIndelLengths() ) { + if ( indelLength > maxIndelSize ) + return true; + } + + return false; + } + private boolean hasPLs(final VariantContext vc) { for ( Genotype g : vc.getGenotypes() ) { if ( g.hasLikelihoods() ) @@ -675,14 +682,6 @@ public class SelectVariants extends RodWalker implements TreeR public void onTraversalDone(Integer result) { logger.info(result + " records processed."); - - if (SELECT_RANDOM_NUMBER) { - int positionToPrint = positionToAdd; - for (int i=0; i implements TreeR GenotypesContext newGC = sub.getGenotypes(); - // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs (because they are no longer accurate) + // if we have fewer alternate alleles in the selected VC than in the original VC, we need to strip out the GL/PLs and AD (because they are no longer accurate) if ( vc.getAlleles().size() != sub.getAlleles().size() ) - newGC = VariantContextUtils.stripPLs(sub.getGenotypes()); + newGC = VariantContextUtils.stripPLsAndAD(sub.getGenotypes()); // if we have fewer samples in the selected VC than in the original VC, we need to strip out the MLE tags if ( vc.getNSamples() != sub.getNSamples() ) { @@ -766,25 +765,4 @@ public class SelectVariants extends RodWalker implements TreeR if ( sawDP ) builder.attribute("DP", depth); } - - private void randomlyAddVariant(int rank, VariantContext vc) { - if (nVariantsAdded < numRandom) - variantArray[nVariantsAdded++] = new 
RandomVariantStructure(vc); - - else { - double v = GenomeAnalysisEngine.getRandomGenerator().nextDouble(); - double t = (1.0/(rank-numRandom+1)); - if ( v < t) { - variantArray[positionToAdd].set(vc); - nVariantsAdded++; - positionToAdd = nextCircularPosition(positionToAdd); - } - } - } - - private int nextCircularPosition(int cur) { - if ((cur + 1) == variantArray.length) - return 0; - return cur + 1; - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java index c92551a73..3e6ab050a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariants.java @@ -46,16 +46,20 @@ import java.util.Set; /** - * Strictly validates a variants file. + * Validates a VCF file with an extra strict set of criteria. * *

    * ValidateVariants is a GATK tool that takes a VCF file and validates much of the information inside it. - * Checks include the correctness of the reference base(s), accuracy of AC & AN values, tests against rsIDs - * when a dbSNP file is provided, and that all alternate alleles are present in at least one sample. + * In addition to standard adherence to the VCF specification, this tool performs extra checks to ensure + * the information contained within the file is correct. Checks include the correctness of the reference base(s), + * accuracy of AC & AN values, tests against rsIDs when a dbSNP file is provided, and that all alternate alleles + * are present in at least one sample. + * + * If you are looking simply to test the adherence to the VCF specification, use --validationType NONE. * *

    Input

    *

    - * A variant set to filter. + * A variant set to validate. *

    * *

    Examples

    @@ -79,10 +83,9 @@ public class ValidateVariants extends RodWalker { protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection(); public enum ValidationType { - ALL, REF, IDS, ALLELES, CHR_COUNTS + ALL, REF, IDS, ALLELES, CHR_COUNTS, NONE } - @Hidden @Argument(fullName = "validationType", shortName = "type", doc = "which validation type to run", required = false) protected ValidationType type = ValidationType.ALL; @@ -172,7 +175,7 @@ public class ValidateVariants extends RodWalker { numErrors++; logger.warn("***** " + e.getMessage() + " *****"); } else { - throw new UserException.MalformedFile(file, e.getMessage()); + throw new UserException.FailsStrictValidation(file, e.getMessage()); } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java index b7ef85a04..4777b807f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPed.java @@ -27,9 +27,7 @@ import java.io.*; import java.util.*; /** - * Yet another VCF to Ped converter. The world actually does need one that will - * work efficiently on large VCFs (or at least give a progress bar). This - * produces a binary ped file in individual major mode. + * Converts a VCF file to a binary plink Ped file (.bed/.bim/.fam) */ @DocumentedGATKFeature( groupName = "Variant Evaluation and Manipulation Tools", extraDocs = {CommandLineGATK.class} ) @Reference(window=@Window(start=0,stop=100)) @@ -43,29 +41,33 @@ public class VariantsToBinaryPed extends RodWalker { /** * The metaData file can take two formats, the first of which is the first 6 lines of the standard ped file. This * is what Plink describes as a fam file. 
An example fam file is (note that there is no header): - * - * CEUTrio NA12878 NA12891 NA12892 2 -9 - * CEUTrio NA12891 UNKN1 UNKN2 2 -9 - * CEUTrio NA12892 UNKN3 UNKN4 1 -9 - * + *

    + * CEUTrio NA12878 NA12891 NA12892 2 -9

    + * CEUTrio NA12891 UNKN1 UNKN2 2 -9

    + * CEUTrio NA12892 UNKN3 UNKN4 1 -9

    + *

    * where the entries are (FamilyID IndividualID DadID MomID Phenotype Sex) - * + *

    * An alternate format is a two-column key-value file - * - * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9 - * NA12891 fid=CEUTrio;sex=2;phenotype=-9 - * NA12892 fid=CEUTrio;sex=1;phenotype=-9 - * + *

    + * NA12878 fid=CEUTrio;dad=NA12891;mom=NA12892;sex=2;phenotype=-9

    + * NA12891 fid=CEUTrio;sex=2;phenotype=-9

    + * NA12892 fid=CEUTrio;sex=1;phenotype=-9

    + *

    * wherein unknown parents needn't be specified. The columns are the individual ID, and a list of key-value pairs. - * + *

    * Regardless of which file is specified, the walker will output a .fam file alongside the bed file. If the * command line has "-md [name].fam", the fam file will simply be copied. However, if a metadata file of the * alternate format is passed by "-md [name].txt", the walker will construct a formatted .fam file from the data. + *

    */ @Input(shortName="m",fullName = "metaData",required=true,doc="Sample metadata file. You may specify a .fam file " + "(in which case it will be copied to the file you provide as fam output).") File metaDataFile; + @Input(shortName="mode",fullName="outputMode",required=false,doc="The output file mode (SNP major or individual major)") + OutputMode mode = OutputMode.INDIVIDUAL_MAJOR; + @Output(shortName="bed",fullName = "bed",required=true,doc="output ped file") PrintStream outBed; @@ -81,6 +83,8 @@ public class VariantsToBinaryPed extends RodWalker { @Argument(fullName="majorAlleleFirst",required=false,doc="Sets the major allele to be 'reference' for the bim file, rather than the ref allele") boolean majorAlleleFirst = false; + enum OutputMode { INDIVIDUAL_MAJOR,SNP_MAJOR } + private static double APPROX_CM_PER_BP = 1000000.0/750000.0; private static final byte HOM_REF = 0x0; @@ -102,6 +106,8 @@ public class VariantsToBinaryPed extends RodWalker { private int genotypeCount = 0; private int byteCount = 0; private List famOrder = new ArrayList(); + private long totalByteCount = 0l; + private long totalGenotypeCount = 0l; public void initialize() { writeBedHeader(); @@ -138,14 +144,18 @@ public class VariantsToBinaryPed extends RodWalker { throw new UserException("No metadata provided for sample "+sample); } } - try { - File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); - printMap.put(sample,new PrintStream(temp)); - tempFiles.put(sample,temp); - } catch (IOException e) { - throw new ReviewedStingException("Error creating temporary file",e); + if ( mode == OutputMode.INDIVIDUAL_MAJOR ) { + // only need to instantiate the files and buffers if in individual major. + // Cut down on memory. 
+ try { + File temp = File.createTempFile("VariantsToBPed_"+sample, ".tmp"); + printMap.put(sample,new PrintStream(temp)); + tempFiles.put(sample,temp); + } catch (IOException e) { + throw new ReviewedStingException("Error creating temporary file",e); + } + genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); } - genotypeBuffer.put(sample,new byte[BUFFER_SIZE]); famOrder.add(sample); } } @@ -195,13 +205,26 @@ public class VariantsToBinaryPed extends RodWalker { // write an entry into the map file outBim.printf("%s\t%s\t%.2f\t%d\t%s\t%s%n",vc.getChr(),getID(vc),APPROX_CM_PER_BP*vc.getStart(),vc.getStart(), refOut,altOut); + if ( mode == OutputMode.INDIVIDUAL_MAJOR ) { + writeIndividualMajor(vc,altMajor); + } else { + writeSNPMajor(vc,altMajor); + } + + + return 1; + } + + public void writeIndividualMajor(VariantContext vc, boolean altMajor) { // store genotypes per sample into the buffer for ( Genotype g : vc.getGenotypes() ) { + ++totalGenotypeCount; String sample = g.getSampleName(); byte[] samBuf = genotypeBuffer.get(sample); byte enc = getEncoding(g,genotypeCount,altMajor); samBuf[byteCount] |= enc; } + genotypeCount++; if ( genotypeCount % 4 == 0 ) { byteCount++; @@ -222,8 +245,30 @@ public class VariantsToBinaryPed extends RodWalker { } genotypeCount = 0; } + } - return 1; + public void writeSNPMajor(VariantContext vc, boolean altMajor) { + // for each sample, write the genotype into the bed file, in the + // order of the fam file + genotypeCount = 0; + byteCount = 0; + byte[] bytes = new byte[(3+famOrder.size())/4]; // this exploits java integer fractions, which round down by default (1-4) -> 1, (5-8) -> 2 + for ( Genotype g : vc.getGenotypesOrderedBy(famOrder) ) { + byte enc = getEncoding(g,genotypeCount,altMajor); + bytes[byteCount] |= enc; + genotypeCount++; + if ( genotypeCount % 4 == 0 ) { + byteCount++; + genotypeCount = 0; + } + } + totalGenotypeCount += famOrder.size(); + totalByteCount += bytes.length; + try { + outBed.write(bytes); + } catch 
(IOException e) { + throw new ReviewedStingException("Error writing to output bed file",e); + } } public Integer reduce(Integer m, Integer r) { @@ -235,7 +280,15 @@ public class VariantsToBinaryPed extends RodWalker { } public void onTraversalDone(Integer numSites) { - logger.info(String.format("%d sites processed!",numSites)); + logger.info(String.format("%d sites processed for a total of %d genotypes encoded in %d bytes",numSites,totalGenotypeCount,totalByteCount)); + + if ( mode == OutputMode.INDIVIDUAL_MAJOR ) { + mergeGenotypeTempFiles(numSites); + } + + } + + private void mergeGenotypeTempFiles(int numSites) { // push out the remaining genotypes and close stream for ( String sample : printMap.keySet() ) { try { @@ -267,18 +320,19 @@ public class VariantsToBinaryPed extends RodWalker { byte[] readGenotypes = new byte[BUFFER_SIZE]; inStream.read(readGenotypes); outBed.write(readGenotypes); + totalByteCount += BUFFER_SIZE; } if ( ttr > 0 ) { byte[] readGenotypes = new byte[ttr]; inStream.read(readGenotypes); outBed.write(readGenotypes); + totalByteCount += ttr; } inStream.close(); } catch (IOException e) { throw new ReviewedStingException("Error reading form temp file for input.",e); } } - } private byte getEncoding(Genotype g, int offset, boolean altMajor) { @@ -331,7 +385,7 @@ public class VariantsToBinaryPed extends RodWalker { return MathUtils.log10ProbabilityToPhredScale(log10gq) >= minGenotypeQuality; } - return false; + return minGenotypeQuality <= 0; } private static String getID(VariantContext v) { @@ -355,7 +409,7 @@ public class VariantsToBinaryPed extends RodWalker { private void writeBedHeader() { // write magic bits into the ped file try { - outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, 0x0}); + outBed.write(new byte[] { (byte) 0x6c, (byte) 0x1b, (byte) (mode == OutputMode.INDIVIDUAL_MAJOR ? 
0x0 : 0x1)}); // ultimately, the bed will be in individual-major mode } catch (IOException e) { throw new ReviewedStingException("error writing to output file."); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index b9577ca9b..dd5264a1b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -42,6 +42,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; import java.io.PrintStream; +import java.lang.reflect.Array; import java.util.*; /** @@ -334,12 +335,12 @@ public class VariantsToTable extends RodWalker { return records; } - private static void addFieldValue(Object val, List> result) { + private static void addFieldValue(final Object val, final List> result) { final int numResultRecords = result.size(); // if we're trying to create a single output record, add it if ( numResultRecords == 1 ) { - result.get(0).add(val.toString()); + result.get(0).add(prettyPrintObject(val)); } // if this field is a list of the proper size, add the appropriate entry to each record else if ( (val instanceof List) && ((List)val).size() == numResultRecords ) { @@ -355,6 +356,26 @@ public class VariantsToTable extends RodWalker { } } + private static String prettyPrintObject(final Object val) { + if ( val instanceof List ) + return prettyPrintObject(((List)val).toArray()); + + if ( !val.getClass().isArray() ) + return val.toString(); + + final int length = Array.getLength(val); + if ( length == 0 ) + return ""; + + final StringBuilder sb = new StringBuilder(prettyPrintObject(Array.get(val, 0))); + for ( int i = 1; i < length; i++ ) { + sb.append(","); + sb.append(prettyPrintObject(Array.get(val, i))); + } + return 
sb.toString(); + } + + public static List> extractFields(VariantContext vc, List fields, boolean allowMissingData) { return extractFields(vc, fields, null, null, allowMissingData, false); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 78c9c4a1c..059e9c5fb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -160,7 +160,7 @@ public class VariantsToVCF extends RodWalker { Map alleleMap = new HashMap(2); alleleMap.put(RawHapMapFeature.DELETION, Allele.create(ref.getBase(), dbsnpVC.isSimpleInsertion())); - alleleMap.put(RawHapMapFeature.INSERTION, Allele.create(ref.getBase() + ((RawHapMapFeature)record).getAlleles()[1], !dbsnpVC.isSimpleInsertion())); + alleleMap.put(RawHapMapFeature.INSERTION, Allele.create((char)ref.getBase() + ((RawHapMapFeature)record).getAlleles()[1], !dbsnpVC.isSimpleInsertion())); hapmap.setActualAlleles(alleleMap); // also, use the correct positioning for insertions @@ -246,7 +246,6 @@ public class VariantsToVCF extends RodWalker { } vc = VariantContextUtils.purgeUnallowedGenotypeAttributes(vc, allowedGenotypeFormatStrings); - vc = VariantContextUtils.addMissingSamples(vc, samples); vcfwriter.add(vc); } diff --git a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java index 368cb3d11..2226c6458 100644 --- a/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java +++ b/public/java/src/org/broadinstitute/sting/queue/extensions/gatk/ArgumentDefinitionField.java @@ -346,7 +346,7 @@ public abstract class ArgumentDefinitionField extends ArgumentField { @Override protected String getFreezeFields() { - 
return String.format("if (num_threads.isDefined) nCoresRequest = num_threads%n"); + return String.format("if (num_threads.isDefined) nCoresRequest = num_threads%nif (num_cpu_threads_per_data_thread.isDefined) nCoresRequest = Some(nCoresRequest.getOrElse(1) * num_cpu_threads_per_data_thread.getOrElse(1))%n"); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java b/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java index 8964c16cb..4455666e8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java +++ b/public/java/src/org/broadinstitute/sting/utils/AutoFormattingTime.java @@ -4,12 +4,21 @@ package org.broadinstitute.sting.utils; * Simple utility class that makes it convenient to print unit adjusted times */ public class AutoFormattingTime { - double timeInSeconds; // in Seconds - int precision; // for format + private final int width; // for format + private final int precision; // for format - public AutoFormattingTime(double timeInSeconds, int precision) { + double timeInSeconds; // in Seconds + private final String formatString; + + public AutoFormattingTime(double timeInSeconds, final int width, int precision) { + this.width = width; this.timeInSeconds = timeInSeconds; this.precision = precision; + this.formatString = "%" + width + "." 
+ precision + "f %s"; + } + + public AutoFormattingTime(double timeInSeconds, int precision) { + this(timeInSeconds, 6, precision); } public AutoFormattingTime(double timeInSeconds) { @@ -20,6 +29,20 @@ public class AutoFormattingTime { return timeInSeconds; } + /** + * @return the precision (a la format's %WIDTH.PERCISIONf) + */ + public int getWidth() { + return width; + } + + /** + * @return the precision (a la format's %WIDTH.PERCISIONf) + */ + public int getPrecision() { + return precision; + } + /** * Instead of 10000 s, returns 2.8 hours * @return @@ -48,6 +71,6 @@ public class AutoFormattingTime { } } - return String.format("%6."+precision+"f %s", unitTime, unit); + return String.format(formatString, unitTime, unit); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 2d7f51c3f..69920ece4 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -1,6 +1,8 @@ package org.broadinstitute.sting.utils; +import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; @@ -197,7 +199,9 @@ public class BaseUtils { * @param base [AaCcGgTt] * @return 0, 1, 2, 3, or -1 if the base can't be understood */ - static public int simpleBaseToBaseIndex(byte base) { + static public int simpleBaseToBaseIndex(final byte base) { + if ( base < 0 || base >= 256 ) + throw new UserException.BadInput("Non-standard bases were encountered in either the input reference or BAM file(s)"); return baseIndexMap[base]; } @@ -444,29 +448,8 @@ public class BaseUtils { * @param bases the bases * @return the upper cased version */ - static public byte[] convertToUpperCase(final byte[] bases) { - for ( int i = 0; i < bases.length; i++ ) { - if ( (char)bases[i] >= 'a' ) - bases[i] = 
toUpperCaseBase(bases[i]); - } - return bases; - } - - static public byte toUpperCaseBase(final byte base) { - switch (base) { - case 'a': - return 'A'; - case 'c': - return 'C'; - case 'g': - return 'G'; - case 't': - return 'T'; - case 'n': - return 'N'; - default: - return base; - } + static public void convertToUpperCase(final byte[] bases) { + StringUtil.toUpperCase(bases); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index f8faa101b..6df9c9f1d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -125,6 +125,15 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return ! discontinuousP( that ); } + /** + * Return true if this GenomeLoc represents the UNMAPPED location + * @return + */ + public final boolean isUnmapped() { + return isUnmapped(this); + } + + /** * Returns a new GenomeLoc that represents the entire span of this and that. 
Requires that * this and that GenomeLoc are contiguous and both mapped @@ -418,7 +427,10 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome result = cmpContig; } else { if ( this.getStart() < that.getStart() ) result = -1; - if ( this.getStart() > that.getStart() ) result = 1; + else if ( this.getStart() > that.getStart() ) result = 1; + // these have the same start, so check the ends + else if ( this.getStop() < that.getStop() ) result = -1; + else if ( this.getStop() > that.getStop() ) result = 1; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index 77ecd295f..a3ffe708c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.utils; import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import com.google.java.contract.ThrowEnsures; import net.sf.picard.reference.ReferenceSequenceFile; @@ -70,7 +69,6 @@ public final class GenomeLocParser { private CachingSequenceDictionary getContigInfo() { if ( contigInfoPerThread.get() == null ) { // initialize for this thread - logger.debug("Creating thread-local caching sequence dictionary for thread " + Thread.currentThread().getName()); contigInfoPerThread.set(new CachingSequenceDictionary(SINGLE_MASTER_SEQUENCE_DICTIONARY)); } diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index fcde1f419..b30d47074 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -34,6 +34,7 @@ import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.Allele; import 
org.broadinstitute.sting.utils.variantcontext.VariantContext; +import java.io.Serializable; import java.util.*; public class Haplotype { @@ -55,22 +56,22 @@ public class Haplotype { * @param bases bases * @param qual qual */ - public Haplotype(byte[] bases, int qual) { - this.bases = bases; + public Haplotype( final byte[] bases, final int qual ) { + this.bases = bases.clone(); quals = new double[bases.length]; Arrays.fill(quals, (double)qual); } - public Haplotype(byte[] bases, double[] quals) { - this.bases = bases; - this.quals = quals; + public Haplotype( final byte[] bases, final double[] quals ) { + this.bases = bases.clone(); + this.quals = quals.clone(); } - public Haplotype(byte[] bases) { + public Haplotype( final byte[] bases ) { this(bases, 0); } - public Haplotype(byte[] bases, GenomeLoc loc) { + public Haplotype( final byte[] bases, final GenomeLoc loc ) { this(bases); this.genomeLocation = loc; } @@ -140,10 +141,10 @@ public class Haplotype { } public double[] getQuals() { - return quals; + return quals.clone(); } public byte[] getBases() { - return bases; + return bases.clone(); } public long getStartPosition() { @@ -184,6 +185,21 @@ public class Haplotype { return new Haplotype(newHaplotypeBases); } + public static class HaplotypeBaseComparator implements Comparator, Serializable { + @Override + public int compare( final Haplotype hap1, final Haplotype hap2 ) { + final byte[] arr1 = hap1.getBases(); + final byte[] arr2 = hap2.getBases(); + // compares byte arrays using lexical ordering + final int len = Math.min(arr1.length, arr2.length); + for( int iii = 0; iii < len; iii++ ) { + final int cmp = arr1[iii] - arr2[iii]; + if (cmp != 0) { return cmp; } + } + return arr2.length - arr1.length; + } + } + public static LinkedHashMap makeHaplotypeListFromAlleles(final List alleleList, final int startPos, final ReferenceContext ref, diff --git a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java 
b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java index d1bc75583..601f90b4d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java +++ b/public/java/src/org/broadinstitute/sting/utils/MannWhitneyU.java @@ -11,6 +11,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.StingException; +import java.io.Serializable; import java.util.Comparator; import java.util.TreeSet; @@ -29,16 +30,26 @@ public class MannWhitneyU { private int sizeSet2; private ExactMode exactMode; - public MannWhitneyU() { - observations = new TreeSet>(new DitheringComparator()); + public MannWhitneyU(ExactMode mode, boolean dither) { + if ( dither ) + observations = new TreeSet>(new DitheringComparator()); + else + observations = new TreeSet>(new NumberedPairComparator()); sizeSet1 = 0; sizeSet2 = 0; - exactMode = ExactMode.POINT; + exactMode = mode; + } + + public MannWhitneyU() { + this(ExactMode.POINT,true); + } + + public MannWhitneyU(boolean dither) { + this(ExactMode.POINT,dither); } public MannWhitneyU(ExactMode mode) { - super(); - exactMode = mode; + this(mode,true); } /** @@ -434,12 +445,14 @@ public class MannWhitneyU { * A comparator class which uses dithering on tie-breaking to ensure that the internal treeset drops no values * and to ensure that rank ties are broken at random. */ - private class DitheringComparator implements Comparator> { + private static class DitheringComparator implements Comparator>, Serializable { public DitheringComparator() {} + @Override public boolean equals(Object other) { return false; } + @Override public int compare(Pair left, Pair right) { double comp = Double.compare(left.first.doubleValue(),right.first.doubleValue()); if ( comp > 0 ) { return 1; } @@ -448,6 +461,22 @@ public class MannWhitneyU { } } + /** + * A comparator that reaches into the pair and compares numbers without tie-breaking. 
+ */ + private static class NumberedPairComparator implements Comparator>, Serializable { + + public NumberedPairComparator() {} + + @Override + public boolean equals(Object other) { return false; } + + @Override + public int compare(Pair left, Pair right ) { + return Double.compare(left.first.doubleValue(),right.first.doubleValue()); + } + } + public enum USet { SET1, SET2 } public enum ExactMode { POINT, CUMULATIVE } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 7d1561fc5..ff153a85c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -51,13 +51,19 @@ public class MathUtils { public static final double[] log10Cache; public static final double[] log10FactorialCache; private static final double[] jacobianLogTable; - private static final double JACOBIAN_LOG_TABLE_STEP = 0.001; - private static final double JACOBIAN_LOG_TABLE_INV_STEP = 1.0 / 0.001; + private static final double JACOBIAN_LOG_TABLE_STEP = 0.0001; + private static final double JACOBIAN_LOG_TABLE_INV_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP; private static final double MAX_JACOBIAN_TOLERANCE = 8.0; private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1; private static final int MAXN = 50000; private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients + /** + * The smallest log10 value we'll emit from normalizeFromLog10 and other functions + * where the real-space value is 0.0. 
+ */ + public final static double LOG10_P_OF_ZERO = -1000000.0; + static { log10Cache = new double[LOG10_CACHE_SIZE]; log10FactorialCache = new double[LOG10_CACHE_SIZE]; @@ -75,6 +81,17 @@ public class MathUtils { } } + /** + * Get a random int between min and max (inclusive) using the global GATK random number generator + * + * @param min lower bound of the range + * @param max upper bound of the range + * @return a random int >= min and <= max + */ + public static int randomIntegerInRange( int min, int max ) { + return GenomeAnalysisEngine.getRandomGenerator().nextInt(max - min + 1) + min; + } + // A fast implementation of the Math.round() method. This method does not perform // under/overflow checking, so this shouldn't be used in the general case (but is fine // if one is already make those checks before calling in to the rounding). @@ -561,16 +578,25 @@ public class MathUtils { return normalizeFromLog10(array, takeLog10OfOutput, false); } + /** + * See #normalizeFromLog10 but with the additional option to use an approximation that keeps the calculation always in log-space + * + * @param array + * @param takeLog10OfOutput + * @param keepInLogSpace + * + * @return + */ public static double[] normalizeFromLog10(double[] array, boolean takeLog10OfOutput, boolean keepInLogSpace) { - // for precision purposes, we need to add (or really subtract, since they're // all negative) the largest value; also, we need to convert to normal-space. 
double maxValue = arrayMax(array); // we may decide to just normalize in log space without converting to linear space if (keepInLogSpace) { - for (int i = 0; i < array.length; i++) + for (int i = 0; i < array.length; i++) { array[i] -= maxValue; + } return array; } @@ -586,8 +612,12 @@ public class MathUtils { sum += normalized[i]; for (int i = 0; i < array.length; i++) { double x = normalized[i] / sum; - if (takeLog10OfOutput) + if (takeLog10OfOutput) { x = Math.log10(x); + if ( x < LOG10_P_OF_ZERO || Double.isInfinite(x) ) + x = array[i] - maxValue; + } + normalized[i] = x; } @@ -625,6 +655,10 @@ public class MathUtils { return maxElementIndex(array, array.length); } + public static int maxElementIndex(final byte[] array) { + return maxElementIndex(array, array.length); + } + public static int maxElementIndex(final int[] array, int endIndex) { if (array == null || array.length == 0) throw new IllegalArgumentException("Array cannot be null!"); @@ -638,6 +672,24 @@ public class MathUtils { return maxI; } + public static int maxElementIndex(final byte[] array, int endIndex) { + if (array == null || array.length == 0) + throw new IllegalArgumentException("Array cannot be null!"); + + int maxI = 0; + for (int i = 1; i < endIndex; i++) { + if (array[i] > array[maxI]) + maxI = i; + } + + return maxI; + } + + public static byte arrayMax(final byte[] array) { + return array[maxElementIndex(array)]; + } + + public static double arrayMax(final double[] array) { return array[maxElementIndex(array)]; } @@ -1142,6 +1194,39 @@ public class MathUtils { return getQScoreOrderStatistic(reads, offsets, (int) Math.floor(reads.size() / 2.)); } + /** + * Check that the log10 prob vector vector is well formed + * + * @param vector + * @param expectedSize + * @param shouldSumToOne + * + * @return true if vector is well-formed, false otherwise + */ + public static boolean goodLog10ProbVector(final double[] vector, final int expectedSize, final boolean shouldSumToOne) { + if ( 
vector.length != expectedSize ) return false; + + for ( final double pr : vector ) { + if ( ! goodLog10Probability(pr) ) + return false; + } + + if ( shouldSumToOne && compareDoubles(sumLog10(vector), 1.0, 1e-4) != 0 ) + return false; + + return true; // everything is good + } + + /** + * Checks that the result is a well-formed log10 probability + * + * @param result a supposedly well-formed log10 probability value + * @return true if result is really well formed + */ + public static boolean goodLog10Probability(final double result) { + return result <= 0.0 && ! Double.isInfinite(result) && ! Double.isNaN(result); + } + /** * A utility class that computes on the fly average and standard deviation for a stream of numbers. * The number of observations does not have to be known in advance, and can be also very big (so that @@ -1634,4 +1719,35 @@ public class MathUtils { } + /** + * Returns a series of integer values between start and stop, inclusive, + * expontentially distributed between the two. That is, if there are + * ten values between 0-10 there will be 10 between 10-100. 
+ * + * WARNING -- BADLY TESTED + * @param start + * @param stop + * @param eps + * @return + */ + public static List log10LinearRange(final int start, final int stop, final double eps) { + final LinkedList values = new LinkedList(); + final double log10range = Math.log10(stop - start); + + if ( start == 0 ) + values.add(0); + + double i = 0.0; + while ( i <= log10range ) { + final int index = (int)Math.round(Math.pow(10, i)) + start; + if ( index < stop && (values.peekLast() == null || values.peekLast() != index ) ) + values.add(index); + i += eps; + } + + if ( values.peekLast() == null || values.peekLast() != stop ) + values.add(stop); + + return values; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MultiThreadedErrorTracker.java b/public/java/src/org/broadinstitute/sting/utils/MultiThreadedErrorTracker.java new file mode 100644 index 000000000..98900031a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/MultiThreadedErrorTracker.java @@ -0,0 +1,80 @@ +package org.broadinstitute.sting.utils; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +/** + * A utility to track exceptions that occur across threads. + * + * Uses a notify mechanism so that multiple threads can tell the tracker that an + * error has occurred, and a master thread can monitor this object for an error + * occurring and take appropriate action. Only maintains the first + * error to reach the tracker. + * + * Refactored from HierarchicalMicroScheduler + * + * User: depristo + * Date: 9/19/12 + * Time: 11:20 AM + */ +public class MultiThreadedErrorTracker { + /** + * An exception that's occurred. If null, no exception has occurred. + */ + private RuntimeException error = null; + + /** + * Convenience function to check, and throw, an error is one is pending + */ + public synchronized void throwErrorIfPending() { + if (hasAnErrorOccurred()) + throw getError(); + } + + /** + * Detects whether an execution error has occurred. 
+ * @return True if an error has occurred. False otherwise. + */ + public synchronized boolean hasAnErrorOccurred() { + return error != null; + } + + /** + * Retrieve the error that has occurred. + * + * @throws ReviewedStingException if no error has occurred. + * @return + */ + public synchronized RuntimeException getError() { + if(!hasAnErrorOccurred()) + throw new ReviewedStingException("User has attempted to retrieve a traversal error when none exists"); + return error; + } + + /** + * Notify this error tracker that an error has occurs. Only updates the tracked + * error if it is currently null (i.e., no error has been already reported). So + * calling this successively with multiple errors only keeps the first, which is the + * right thing to do as the initial failure is usually the meaningful one, but + * generates a cascade of failures as other subsystems fail. + */ + public synchronized RuntimeException notifyOfError(Throwable error) { + if ( this.error == null ) + this.error = toRuntimeException(error); + + return this.error; + } + + /** + * Convert error to a Runtime exception, or keep as is if it already is one + * + * @param error the error that has occurred + * @return the potentially converted error + */ + private RuntimeException toRuntimeException(final Throwable error) { + // If the error is already a Runtime, pass it along as is. Otherwise, wrap it. + if (error instanceof RuntimeException) + return (RuntimeException)error; + else + return new ReviewedStingException("An error occurred during the traversal. 
Message=" + error.getMessage(), error); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/PairHMM.java deleted file mode 100644 index 15f7a7869..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/PairHMM.java +++ /dev/null @@ -1,259 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.utils; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; - -import java.util.*; - -/** - * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. 
- * User: rpoplin - * Date: 3/1/12 - */ - -public class PairHMM { - private static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; - private static final byte DEFAULT_GOP = (byte) 45; - private static final byte DEFAULT_GCP = (byte) 10; - private static final double BANDING_TOLERANCE = 22.0; - private static final int BANDING_CLUSTER_WINDOW = 12; - private final boolean noBanded; - - public PairHMM() { - noBanded = false; - } - - public PairHMM( final boolean noBanded ) { - this.noBanded = noBanded; - } - - - public static void initializeArrays(final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray, - final int X_METRIC_LENGTH) { - - for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { - Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); - Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); - } - - // the initial condition - matchMetricArray[1][1] = 0.0; // Math.log10(1.0); - - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // initial arrays to hold the probabilities of being in the match, insertion and deletion cases - final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] 
XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; - - initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH); - - return computeReadLikelihoodGivenHaplotype(haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, 0, matchMetricArray, XMetricArray, YMetricArray); - } - - @Requires({"readBases.length == readQuals.length","readBases.length == insertionGOP.length","readBases.length == deletionGOP.length","readBases.length == overallGCP.length"}) - @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 probability - public double computeReadLikelihoodGivenHaplotype( final byte[] haplotypeBases, final byte[] readBases, final byte[] readQuals, - final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, final int hapStartIndex, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment - final int X_METRIC_LENGTH = readBases.length + 2; - final int Y_METRIC_LENGTH = haplotypeBases.length + 2; - - // ensure that all the qual scores have valid values - for( int iii = 0; iii < readQuals.length; iii++ ) { - readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : readQuals[iii]) ); - } - - if( false ) { - final ArrayList workQueue = new ArrayList(); // holds a queue of starting work location (indices along the diagonal). 
Will be sorted each step - final ArrayList workToBeAdded = new ArrayList(); - final ArrayList calculatedValues = new ArrayList(); - final int numDiags = X_METRIC_LENGTH + Y_METRIC_LENGTH - 1; - workQueue.add( 1 ); // Always start a new thread at the baseline because of partially repeating sequences that match better in the latter half of the haplotype - - for(int diag = 3; diag < numDiags; diag++) { // diag = 3 is the (1,2) element of the metric arrays. (1,1) is the initial condition and is purposefully skipped over - //Collections.sort(workQueue); // no need to sort because elements are guaranteed to be in ascending order - int el = 1; - for( int work : workQueue ) { - // choose the appropriate diagonal baseline location - int iii = 0; - int jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work; - jjj -= work; - while( iii >= X_METRIC_LENGTH || jjj <= 0 ) { - iii--; - jjj++; - work--; - } - if( !detectClusteredStartLocations(workToBeAdded, work ) ) { - workToBeAdded.add(work); // keep this thread going once it has started - } - - if( work >= el - 3 ) { - // step along the diagonal in the forward direction, updating the match matrices and looking for a drop off from the maximum observed value - double maxElement = Double.NEGATIVE_INFINITY; - for( el = work; el < numDiags + 1; el++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - calculatedValues.add(bestMetric); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - if( ++iii >= X_METRIC_LENGTH ) { // don't walk off the edge of the matrix - break; - } - if( --jjj <= 0 ) { // don't walk off 
the edge of the matrix - break; - } - } - - // find a local maximum to start a new band in the work queue - double localMaxElement = Double.NEGATIVE_INFINITY; - int localMaxElementIndex = 0; - for(int kkk = calculatedValues.size()-1; kkk >= 1; kkk--) { - final double bestMetric = calculatedValues.get(kkk); - if( bestMetric > localMaxElement ) { - localMaxElement = bestMetric; - localMaxElementIndex = kkk; - } else if( localMaxElement - bestMetric > BANDING_TOLERANCE * 0.5 ) { // find a local maximum - if( !detectClusteredStartLocations(workToBeAdded, work + localMaxElementIndex ) ) { - workToBeAdded.add( work + localMaxElementIndex ); - } - break; - } - } - calculatedValues.clear(); - - // reset iii and jjj to the appropriate diagonal baseline location - iii = 0; - jjj = diag; - if( diag > Y_METRIC_LENGTH ) { - iii = diag - Y_METRIC_LENGTH; - jjj = Y_METRIC_LENGTH; - } - // move to the starting work location along the diagonal - iii += work-1; - jjj -= work-1; - - // step along the diagonal in the reverse direction, updating the match matrices and looking for a drop off from the maximum observed value - for( int traceBack = work - 1; traceBack > 0 && iii > 0 && jjj < Y_METRIC_LENGTH; traceBack--,iii--,jjj++ ) { - updateCell(iii, jjj, haplotypeBases, readBases, readQuals, - insertionGOP, deletionGOP, overallGCP, matchMetricArray, XMetricArray, YMetricArray); - final double bestMetric = MathUtils.max(matchMetricArray[iii][jjj], XMetricArray[iii][jjj], YMetricArray[iii][jjj]); - if( bestMetric > maxElement ) { - maxElement = bestMetric; - } else if( maxElement - bestMetric > BANDING_TOLERANCE ) { - break; - } - } - } - } - workQueue.clear(); - workQueue.addAll(workToBeAdded); - workToBeAdded.clear(); - } - } else { - // simple rectangular version of update loop, slow - for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { - for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { - if( (iii == 1 && jjj == 1) ) { continue; } - updateCell(iii, jjj, 
haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, - matchMetricArray, XMetricArray, YMetricArray); - } - } - } - - // final probability is the log10 sum of the last element in all three state arrays - final int endI = X_METRIC_LENGTH - 1; - final int endJ = Y_METRIC_LENGTH - 1; - return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); - } - - private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, - final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, - final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { - - // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions - final int im1 = indI - 1; - final int jm1 = indJ - 1; - - // update the match array - double pBaseReadLog10 = 0.0; // Math.log10(1.0); - if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state - final byte x = readBases[im1-1]; - final byte y = haplotypeBases[jm1-1]; - final byte qual = readQuals[im1-1]; - pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); - } - final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); - final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); - final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); - matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); - - // update the X (insertion) array - final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); - final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); - - // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype - final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); - final double e2 = ( im1 == 0 || im1 == readBases.length ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); - final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 - YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); - } - - // private function used by the banded approach to ensure the proposed bands are sufficiently distinct from each other - private boolean detectClusteredStartLocations( final ArrayList list, int loc ) { - for(int x : list) { - if( Math.abs(x-loc) <= BANDING_CLUSTER_WINDOW ) { - return true; - } - } - return false; - } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java index 15d34a348..4c54d4126 100644 --- a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java +++ b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java @@ -1,18 +1,42 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +import java.util.concurrent.TimeUnit; + /** - * A useful simple system for timing code. This code is not thread safe! + * A useful simple system for timing code with nano second resolution + * + * Note that this code is not thread-safe. If you have a single timer + * being started and stopped by multiple threads you will need to protect the + * calls to avoid meaningless results of having multiple starts and stops + * called sequentially. * * User: depristo * Date: Dec 10, 2010 * Time: 9:07:44 AM */ public class SimpleTimer { - final private String name; - private long elapsed = 0l; - private long startTime = 0l; - boolean running = false; + protected static final double NANO_TO_SECOND_DOUBLE = 1.0 / TimeUnit.SECONDS.toNanos(1); + private final String name; + + /** + * The elapsedTimeNano time in nanoSeconds of this timer. 
The elapsedTimeNano time is the + * sum of times between starts/restrats and stops. + */ + private long elapsedTimeNano = 0l; + + /** + * The start time of the last start/restart in nanoSeconds + */ + private long startTimeNano = 0l; + + /** + * Is this timer currently running (i.e., the last call was start/restart) + */ + private boolean running = false; /** * Creates an anonymous simple timer @@ -25,7 +49,8 @@ public class SimpleTimer { * Creates a simple timer named name * @param name of the timer, must not be null */ - public SimpleTimer(String name) { + public SimpleTimer(final String name) { + if ( name == null ) throw new IllegalArgumentException("SimpleTimer name cannot be null"); this.name = name; } @@ -37,27 +62,27 @@ public class SimpleTimer { } /** - * Starts the timer running, and sets the elapsed time to 0. This is equivalent to + * Starts the timer running, and sets the elapsedTimeNano time to 0. This is equivalent to * resetting the time to have no history at all. * * @return this object, for programming convenience */ + @Ensures("elapsedTimeNano == 0l") public synchronized SimpleTimer start() { - elapsed = 0l; - restart(); - return this; + elapsedTimeNano = 0l; + return restart(); } /** - * Starts the timer running, without reseting the elapsed time. This function may be + * Starts the timer running, without resetting the elapsedTimeNano time. This function may be * called without first calling start(). The only difference between start and restart - * is that start resets the elapsed time, while restart does not. + * is that start resets the elapsedTimeNano time, while restart does not. 
* * @return this object, for programming convenience */ public synchronized SimpleTimer restart() { running = true; - startTime = currentTime(); + startTimeNano = currentTimeNano(); return this; } @@ -71,29 +96,62 @@ public class SimpleTimer { /** * @return A convenience function to obtain the current time in milliseconds from this timer */ - public synchronized long currentTime() { + public long currentTime() { return System.currentTimeMillis(); } /** - * Stops the timer. Increases the elapsed time by difference between start and now. The - * timer must be running in order to call stop + * @return A convenience function to obtain the current time in nanoSeconds from this timer + */ + public long currentTimeNano() { + return System.nanoTime(); + } + + /** + * Stops the timer. Increases the elapsedTimeNano time by difference between start and now. + * + * It's ok to call stop on a timer that's not running. It has no effect on the timer. * * @return this object, for programming convenience */ + @Requires("startTimeNano != 0l") public synchronized SimpleTimer stop() { - running = false; - elapsed += currentTime() - startTime; + if ( running ) { + running = false; + elapsedTimeNano += currentTimeNano() - startTimeNano; + } return this; } /** - * Returns the total elapsed time of all start/stops of this timer. If the timer is currently + * Returns the total elapsedTimeNano time of all start/stops of this timer. If the timer is currently * running, includes the difference from currentTime() and the start as well * * @return this time, in seconds */ public synchronized double getElapsedTime() { - return (running ? 
(currentTime() - startTime + elapsed) : elapsed) / 1000.0; + return nanoToSecondsAsDouble(getElapsedTimeNano()); + } + + protected static double nanoToSecondsAsDouble(final long nano) { + return nano * NANO_TO_SECOND_DOUBLE; + } + + /** + * @see #getElapsedTime() but returns the result in nanoseconds + * + * @return the elapsed time in nanoseconds + */ + public synchronized long getElapsedTimeNano() { + return running ? (currentTimeNano() - startTimeNano + elapsedTimeNano) : elapsedTimeNano; + } + + /** + * Add the elapsed time from toAdd to this elapsed time + * + * @param toAdd the timer whose elapsed time we want to add to this timer + */ + public synchronized void addElapsed(final SimpleTimer toAdd) { + elapsedTimeNano += toAdd.getElapsedTimeNano(); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index a5b5eca6a..f4a200af0 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -236,6 +236,33 @@ public class Utils { } } + public static List append(final List left, T ... elts) { + final List l = new LinkedList(left); + l.addAll(Arrays.asList(elts)); + return l; + } + + /** + * Returns a string of the values in joined by separator, such as A,B,C + * + * @param separator + * @param doubles + * @return + */ + public static String join(String separator, double[] doubles) { + if ( doubles == null || doubles.length == 0) + return ""; + else { + StringBuilder ret = new StringBuilder(); + ret.append(doubles[0]); + for (int i = 1; i < doubles.length; ++i) { + ret.append(separator); + ret.append(doubles[i]); + } + return ret.toString(); + } + } + /** * Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of * elti objects (note there's no actual space between sep and the elti elements). 
Returns @@ -810,4 +837,25 @@ public class Utils { return Collections.unmodifiableMap(map); } + /** + * Divides the input list into a list of sublists, which contains group size elements (except potentially the last one) + * + * list = [A, B, C, D, E] + * groupSize = 2 + * result = [[A, B], [C, D], [E]] + * + * @param list + * @param groupSize + * @return + */ + public static List> groupList(final List list, final int groupSize) { + if ( groupSize < 1 ) throw new IllegalArgumentException("groupSize >= 1"); + + final List> subLists = new LinkedList>(); + int n = list.size(); + for ( int i = 0; i < n; i += groupSize ) { + subLists.add(list.subList(i, Math.min(i + groupSize, n))); + } + return subLists; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index dab8ddc78..decc54d47 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -16,7 +16,7 @@ import java.util.ArrayList; * Date: 1/4/12 */ -public class ActiveRegion implements HasGenomeLocation, Comparable { +public class ActiveRegion implements HasGenomeLocation { private final ArrayList reads = new ArrayList(); private final GenomeLoc activeRegionLoc; @@ -78,11 +78,6 @@ public class ActiveRegion implements HasGenomeLocation, Comparable return reference; } - @Override - public int compareTo( final ActiveRegion other ) { - return this.getLocation().compareTo(other.getLocation()); - } - @Override public GenomeLoc getLocation() { return activeRegionLoc; } public GenomeLoc getExtendedLoc() { return extendedLoc; } @@ -102,4 +97,19 @@ public class ActiveRegion implements HasGenomeLocation, Comparable if ( extendedLoc.compareTo(other.extendedLoc) != 0 ) return false; return true; } + + /** + * A comparator class which is used to sort ActiveRegions by their start location + */ 
+ /* + public static class ActiveRegionStartLocationComparator implements Comparator { + + public ActiveRegionStartLocationComparator() {} + + @Override + public int compare(final ActiveRegion left, final ActiveRegion right) { + return left.getLocation().compareTo(right.getLocation()); + } + } + */ } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index 439a0d8ed..cf4d699ee 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -52,13 +52,6 @@ public class BAQ { DONT_MODIFY // do the BAQ, but don't modify the quality scores themselves, just return them in the function. } - public enum ApplicationTime { - FORBIDDEN, // Walker does not tolerate BAQ input - ON_INPUT, // apply the BAQ calculation to the incoming reads, the default - ON_OUTPUT, // apply the BAQ calculation to outgoing read streams - HANDLED_IN_WALKER // the walker will deal with the BAQ calculation status itself - } - public static final String BAQ_TAG = "BQ"; private static double[] qual2prob = new double[256]; @@ -68,7 +61,7 @@ public class BAQ { } // Phred scaled now (changed 1/10/2011) - public static double DEFAULT_GOP = 40; + public static final double DEFAULT_GOP = 40; /* Takes a Phred Scale quality score and returns the error probability. 
* @@ -110,10 +103,19 @@ public class BAQ { * Use defaults for everything */ public BAQ() { - cd = convertFromPhredScale(DEFAULT_GOP); + this(DEFAULT_GOP); + } + + /** + * Use defaults for everything + */ + public BAQ(final double gapOpenPenalty) { + cd = convertFromPhredScale(gapOpenPenalty); initializeCachedData(); } + + /** * Create a new HmmGlocal object with specified parameters * diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java new file mode 100644 index 000000000..4589ffb71 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQReadTransformer.java @@ -0,0 +1,49 @@ +package org.broadinstitute.sting.utils.baq; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.BAQMode; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * Applies Heng's BAQ calculation to a stream of incoming reads + */ +public class BAQReadTransformer extends ReadTransformer { + private BAQ baqHMM; + private IndexedFastaSequenceFile refReader; + private BAQ.CalculationMode cmode; + private BAQ.QualityMode qmode; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + final BAQMode mode = WalkerManager.getWalkerAnnotation(walker, BAQMode.class); + this.refReader = engine.getReferenceDataSource().getReference(); + this.cmode = engine.getArguments().BAQMode; + this.qmode = mode.QualityMode(); + baqHMM = new BAQ(engine.getArguments().BAQGOP); + + if ( qmode == 
BAQ.QualityMode.DONT_MODIFY ) + throw new ReviewedStingException("BUG: shouldn't create BAQ transformer with quality mode DONT_MODIFY"); + + if ( mode.ApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN && enabled() ) + throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + cmode + " was requested."); + + return mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return cmode != BAQ.CalculationMode.OFF; + } + + @Override + public GATKSAMRecord apply(final GATKSAMRecord read) { + baqHMM.baqRead(read, refReader, cmode, qmode); + return read; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java deleted file mode 100644 index adfeef518..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQSamIterator.java +++ /dev/null @@ -1,59 +0,0 @@ -package org.broadinstitute.sting.utils.baq; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - -import java.util.Iterator; - -/** - * Simple iterator that applies Heng's BAQ calculation to a stream of incoming reads - */ -public class BAQSamIterator implements StingSAMIterator { - private final StingSAMIterator it; - private final BAQ baqHMM = new BAQ(); // creates a BAQ creator with default parameters - private final IndexedFastaSequenceFile refReader; - private final BAQ.CalculationMode cmode; - private final BAQ.QualityMode qmode; - - /** - * Creates a new BAMSamIterator using the reference getter refReader and applies the BAM to the reads coming - * in from it. See BAQ docs for baqType information. 
- * - * @param refReader - * @param it - * @param cmode - * @param qmode - */ - @Requires({ - "refReader != null", - "it != null", - "cmode != null" , - "qmode != null"}) - public BAQSamIterator(IndexedFastaSequenceFile refReader, StingSAMIterator it, BAQ.CalculationMode cmode, BAQ.QualityMode qmode) { - if ( cmode == BAQ.CalculationMode.OFF ) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with calculation mode OFF"); - if ( qmode == BAQ.QualityMode.DONT_MODIFY ) throw new ReviewedStingException("BUG: shouldn't create BAQSamIterator with quailty mode DONT_MODIFY"); - - this.refReader = refReader; - this.it = it; - this.cmode = cmode; - this.qmode = qmode; - } - - @Requires("hasNext()") - @Ensures("result != null") - public SAMRecord next() { - //System.out.printf("BAQing during input%n"); - SAMRecord read = it.next(); - baqHMM.baqRead(read, refReader, cmode, qmode); - return read; - } - - public boolean hasNext() { return this.it.hasNext(); } - public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - public void close() { it.close(); } - public Iterator iterator() { return this; } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java new file mode 100644 index 000000000..18ab9e01a --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/baq/ReadTransformingIterator.java @@ -0,0 +1,44 @@ +package org.broadinstitute.sting.utils.baq; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.Iterator; + +/** + * Iterator that applies a ReadTransformer to a stream of reads + 
*/ +public class ReadTransformingIterator implements StingSAMIterator { + private final StingSAMIterator it; + private final ReadTransformer transformer; + + /** + * Creates a new ReadTransforming iterator + */ + @Requires({"it != null", "transformer != null", "transformer.isInitialized()"}) + public ReadTransformingIterator(final StingSAMIterator it, final ReadTransformer transformer) { + if ( ! transformer.isInitialized() ) + throw new IllegalStateException("Creating a read transformer stream for an uninitialized read transformer: " + transformer); + if ( transformer.getApplicationTime() == ReadTransformer.ApplicationTime.FORBIDDEN ) + throw new IllegalStateException("Creating a read transformer stream for a forbidden transformer " + transformer); + + this.it = it; + this.transformer = transformer; + } + + @Requires("hasNext()") + @Ensures("result != null") + public SAMRecord next() { + final GATKSAMRecord read = (GATKSAMRecord)it.next(); + return transformer.apply(read); + } + + public boolean hasNext() { return this.it.hasNext(); } + public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } + public void close() { it.close(); } + public Iterator iterator() { return this; } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java b/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java index dd12ce761..49851249c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/JVMUtils.java @@ -32,7 +32,6 @@ import org.reflections.util.ClasspathHelper; import java.io.File; import java.io.IOException; -import java.lang.annotation.Annotation; import java.lang.reflect.*; import java.net.URL; import java.util.*; @@ -198,7 +197,7 @@ public class JVMUtils { * @return the list of class path urls. 
*/ public static Set getClasspathURLs() { - return ClasspathHelper.getUrlsForManifestsCurrentClasspath(); + return ClasspathHelper.forManifest(); } /** @@ -240,8 +239,8 @@ public class JVMUtils { /** * Returns a comma-separated list of the names of the interfaces implemented by this class * - * @param covClass - * @return + * @param covClass class + * @return names of interfaces */ public static String classInterfaces(final Class covClass) { final List interfaces = new ArrayList(); diff --git a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java index 9a2cb68db..43cc800d8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/classloader/PluginManager.java @@ -25,15 +25,14 @@ package org.broadinstitute.sting.utils.classloader; -import ch.qos.logback.classic.Level; -import ch.qos.logback.classic.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.reflections.Reflections; import org.reflections.scanners.SubTypesScanner; import org.reflections.util.ConfigurationBuilder; -import org.slf4j.LoggerFactory; import java.io.File; import java.lang.reflect.Constructor; @@ -55,9 +54,8 @@ public class PluginManager { private static final Reflections defaultReflections; static { - // turn off logging in the reflections library - they talk too much (to the wrong logger factory as well, logback) - Logger logger = (ch.qos.logback.classic.Logger) LoggerFactory.getLogger(Reflections.class); - logger.setLevel(Level.OFF); + // turn off logging in the reflections library - they talk too much + 
Reflections.log = null; Set classPathUrls = new LinkedHashSet(); @@ -177,9 +175,9 @@ public class PluginManager { /** * Sorts, in place, the list of plugins according to getName() on each element * - * @param unsortedPlugins + * @param unsortedPlugins unsorted plugins */ - private final void sortPlugins(final List> unsortedPlugins) { + private void sortPlugins(final List> unsortedPlugins) { Collections.sort(unsortedPlugins, new ComparePluginsByName()); } @@ -233,7 +231,7 @@ public class PluginManager { * @param plugin Name of the plugin for which to search. * @return True if the plugin exists, false otherwise. */ - public boolean exists(Class plugin) { + public boolean exists(Class plugin) { return pluginsByName.containsValue(plugin); } @@ -276,8 +274,16 @@ public class PluginManager { */ public PluginType createByName(String pluginName) { Class plugin = pluginsByName.get(pluginName); - if( plugin == null ) - throw new UserException(String.format("Could not find %s with name: %s", pluginCategory,pluginName)); + if( plugin == null ) { + String errorMessage = formatErrorMessage(pluginCategory,pluginName); + if ( FilterManager.class.isAssignableFrom(this.getClass()) ) { + throw new UserException.MalformedReadFilterException(errorMessage); + } else if ( WalkerManager.class.isAssignableFrom(this.getClass()) ) { + throw new UserException.MalformedWalkerArgumentsException(errorMessage); + } else { + throw new UserException.CommandLineException(errorMessage); + } + } try { return plugin.newInstance(); } catch (Exception e) { @@ -330,4 +336,14 @@ public class PluginManager { return pluginName; } + + /** + * Generate the error message for the plugin manager. The message is allowed to depend on the class. + * @param pluginCategory - string, the category of the plugin (e.g.
read filter) + * @param pluginName - string, what we were trying to match (but failed to) + * @return error message text describing the error + */ + protected String formatErrorMessage(String pluginCategory, String pluginName ) { + return String.format("Could not find %s with name: %s", pluginCategory,pluginName); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index 08c50b982..98eb582e8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -18,7 +18,7 @@ import java.util.Vector; * of the read, plus an option extraInfo (useful for carrying info where needed). *

    * Also holds the critical apply function that actually execute the clipping operation on a provided read, - * according to the wishes of the supplid ClippingAlgorithm enum. + * according to the wishes of the supplied ClippingAlgorithm enum. */ public class ClippingOp { public final int start, stop; // inclusive @@ -37,34 +37,60 @@ public class ClippingOp { * Clips the bases in read according to this operation's start and stop. Uses the clipping * representation used is the one provided by algorithm argument. * - * @param algorithm - * @param read + * @param algorithm clipping algorithm to use + * @param originalRead the read to be clipped */ - public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord read) { + public GATKSAMRecord apply(ClippingRepresentation algorithm, GATKSAMRecord originalRead) { + GATKSAMRecord read; + try { + read = (GATKSAMRecord) originalRead.clone(); + } catch (CloneNotSupportedException e) { + throw new ReviewedStingException("Where did the clone go?"); + } byte[] quals = read.getBaseQualities(); byte[] bases = read.getReadBases(); + byte[] newBases = new byte[bases.length]; + byte[] newQuals = new byte[quals.length]; switch (algorithm) { // important note: // it's not safe to call read.getReadBases()[i] = 'N' or read.getBaseQualities()[i] = 0 // because you're not guaranteed to get a pointer to the actual array of bytes in the GATKSAMRecord case WRITE_NS: - for (int i = start; i <= stop; i++) - bases[i] = 'N'; - read.setReadBases(bases); + for (int i = 0; i < bases.length; i++) { + if (i >= start && i <= stop) { + newBases[i] = 'N'; + } + else { + newBases[i] = bases[i]; + } + } + read.setReadBases(newBases); break; case WRITE_Q0S: - for (int i = start; i <= stop; i++) - quals[i] = 0; - read.setBaseQualities(quals); + for (int i = 0; i < quals.length; i++) { + if (i >= start && i <= stop) { + newQuals[i] = 0; + } + else { + newQuals[i] = quals[i]; + } + } + read.setBaseQualities(newQuals); break; case WRITE_NS_Q0S: - 
for (int i = start; i <= stop; i++) { - bases[i] = 'N'; - quals[i] = 0; + for (int i = 0; i < bases.length; i++) { + if (i >= start && i <= stop) { + newQuals[i] = 0; + newBases[i] = 'N'; + } + else { + newQuals[i] = quals[i]; + newBases[i] = bases[i]; + } } - read.setReadBases(bases); - read.setBaseQualities(quals); + read.setBaseQualities(newQuals); + read.setReadBases(newBases); break; case HARDCLIP_BASES: read = hardClip(read, start, stop); @@ -437,8 +463,8 @@ public class ClippingOp { * Checks if a hard clipped cigar left a read starting or ending with insertions/deletions * and cleans it up accordingly. * - * @param cigar - * @return + * @param cigar the original cigar + * @return an object with the shifts (see CigarShift class) */ private CigarShift cleanHardClippedCigar(Cigar cigar) { Cigar cleanCigar = new Cigar(); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java index 916fb43ea..6b3fce966 100644 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/hapmap/RawHapMapCodec.java @@ -25,8 +25,11 @@ package org.broadinstitute.sting.utils.codecs.hapmap; import org.broad.tribble.AsciiFeatureCodec; +import org.broad.tribble.FeatureCodecHeader; import org.broad.tribble.annotation.Strand; +import org.broad.tribble.readers.AsciiLineReader; import org.broad.tribble.readers.LineReader; +import org.broad.tribble.readers.PositionalBufferedStream; import java.io.IOException; import java.util.Arrays; @@ -116,4 +119,10 @@ public class RawHapMapCodec extends AsciiFeatureCodec { } return headerLine; } + + @Override + public FeatureCodecHeader readHeader(final PositionalBufferedStream stream) throws IOException { + final AsciiLineReader br = new AsciiLineReader(stream); + return new FeatureCodecHeader(readHeader(br), br.getPosition()); + } } diff --git
a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java index 4df1efee7..f12f13dc7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCodec.java @@ -2,8 +2,6 @@ package org.broadinstitute.sting.utils.codecs.vcf; import org.broad.tribble.TribbleException; import org.broad.tribble.readers.LineReader; -import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.variantcontext.*; import java.io.IOException; import java.util.*; @@ -119,7 +117,7 @@ public class VCFCodec extends AbstractVCFCodec { // empty set for passes filters List fFields = new LinkedList(); // otherwise we have to parse and cache the value - if ( filterString.indexOf(VCFConstants.FILTER_CODE_SEPARATOR) == -1 ) + if ( !filterString.contains(VCFConstants.FILTER_CODE_SEPARATOR) ) fFields.add(filterString); else fFields.addAll(Arrays.asList(filterString.split(VCFConstants.FILTER_CODE_SEPARATOR))); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java index 667de3dea..5273806a7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFCompoundHeaderLine.java @@ -88,8 +88,8 @@ public abstract class VCFCompoundHeaderLine extends VCFHeaderLine implements VCF case UNBOUNDED: return -1; case A: return vc.getNAlleles() - 1; case G: - final int ploidy = vc.getMaxPloidy(); - return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy == 0 ? 
2 : ploidy); + final int ploidy = vc.getMaxPloidy(2); + return GenotypeLikelihoods.numLikelihoods(vc.getNAlleles(), ploidy); default: throw new ReviewedStingException("Unknown count type: " + countType); } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java index 2663e848f..44a3e9af3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFHeader.java @@ -159,6 +159,7 @@ public class VCFHeader { */ public void addMetaDataLine(VCFHeaderLine headerLine) { mMetaData.add(headerLine); + loadMetaDataMaps(); } /** @@ -236,7 +237,6 @@ public class VCFHeader { + VCFConstants.GENOTYPE_PL_KEY + " field. As the GATK now only manages PL fields internally" + " automatically adding a corresponding PL field to your VCF header"); addMetaDataLine(new VCFFormatHeaderLine(VCFConstants.GENOTYPE_PL_KEY, VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification")); - loadMetaDataMaps(); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java new file mode 100644 index 000000000..63927ac84 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/collections/LoggingNestedIntegerArray.java @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is 
furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.collections; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.io.PrintStream; + +/** + * Wrapper around the basic NestedIntegerArray class that logs all accesses (i.e., all calls to get() and put()) + * to the provided output stream. For testing/debugging purposes. + * + * Log entries are of the following form (fields are tab-separated; VALUE is empty for GET entries): + * LABEL OPERATION VALUE KEY1 KEY2 ... KEY_N + * + * A header line is written before the log entries giving the dimensions of this NestedIntegerArray. + * It has the form: + * + * # LABEL SIZE_OF_FIRST_DIMENSION SIZE_OF_SECOND_DIMENSION ... SIZE_OF_NTH_DIMENSION + * + * @author David Roazen + */ +public class LoggingNestedIntegerArray extends NestedIntegerArray { + + private PrintStream log; + private String logEntryLabel; + + public static final String HEADER_LINE_PREFIX = "# "; + public enum NestedIntegerArrayOperation { GET, PUT }; + + /** + * Creates the array and immediately writes the header line to the log + * @param log output stream to which to log get and put operations; must not be null + * @param logEntryLabel String that should be prefixed to each log entry; null is treated as the empty string + * @param dimensions size of each dimension, passed through to NestedIntegerArray and recorded in the header line + */ + public LoggingNestedIntegerArray( PrintStream log, String logEntryLabel, final int...
dimensions ) { + super(dimensions); + + if ( log == null ) { + throw new ReviewedStingException("Log output stream must not be null"); + } + this.log = log; + this.logEntryLabel = logEntryLabel != null ? logEntryLabel : ""; + + // Write the header line recording the dimensions of this NestedIntegerArray: + StringBuilder logHeaderLine = new StringBuilder(); + + logHeaderLine.append(HEADER_LINE_PREFIX); + logHeaderLine.append(this.logEntryLabel); + for ( int dimension : dimensions ) { + logHeaderLine.append("\t"); + logHeaderLine.append(dimension); + } + + this.log.println(logHeaderLine.toString()); + } + + @Override + public T get( final int... keys ) { + StringBuilder logEntry = new StringBuilder(); + + logEntry.append(logEntryLabel); + logEntry.append("\t"); + logEntry.append(NestedIntegerArrayOperation.GET); + logEntry.append("\t"); // empty field for the datum value + + for ( int key : keys ) { + logEntry.append("\t"); + logEntry.append(key); + } + + log.println(logEntry.toString()); + + return super.get(keys); + } + + @Override + public boolean put( final T value, final int... 
keys ) { + StringBuilder logEntry = new StringBuilder(); + + logEntry.append(logEntryLabel); + logEntry.append("\t"); + logEntry.append(NestedIntegerArrayOperation.PUT); + logEntry.append("\t"); + logEntry.append(value); + for ( int key : keys ) { + logEntry.append("\t"); + logEntry.append(key); + } + + // PrintStream methods all use synchronized blocks internally, so our logging is thread-safe + log.println(logEntry.toString()); + + return super.put(value, keys); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java index 31d316555..050ed52ac 100755 --- a/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/NestedIntegerArray.java @@ -25,9 +25,11 @@ package org.broadinstitute.sting.utils.collections; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** @@ -38,18 +40,51 @@ import java.util.List; public class NestedIntegerArray { + private static Logger logger = Logger.getLogger(NestedIntegerArray.class); + protected final Object[] data; protected final int numDimensions; protected final int[] dimensions; + // Preallocate the first two dimensions to limit contention during tree traversals in put() + private static final int NUM_DIMENSIONS_TO_PREALLOCATE = 2; + public NestedIntegerArray(final int... 
dimensions) { numDimensions = dimensions.length; if ( numDimensions == 0 ) throw new ReviewedStingException("There must be at least one dimension to an NestedIntegerArray"); this.dimensions = dimensions.clone(); + int dimensionsToPreallocate = Math.min(dimensions.length, NUM_DIMENSIONS_TO_PREALLOCATE); + + logger.info(String.format("Creating NestedIntegerArray with dimensions %s", Arrays.toString(dimensions))); + logger.info(String.format("Pre-allocating first %d dimensions", dimensionsToPreallocate)); + data = new Object[dimensions[0]]; + preallocateArray(data, 0, dimensionsToPreallocate); + + logger.info(String.format("Done pre-allocating first %d dimensions", dimensionsToPreallocate)); + } + + /** + * Recursively allocate the first dimensionsToPreallocate dimensions of the tree + * + * Pre-allocating the first few dimensions helps limit contention during tree traversals in put() + * + * @param subarray current node in the tree + * @param dimension current level in the tree + * @param dimensionsToPreallocate preallocate only this many dimensions (starting from the first) + */ + private void preallocateArray( Object[] subarray, int dimension, int dimensionsToPreallocate ) { + if ( dimension >= dimensionsToPreallocate - 1 ) { + return; + } + + for ( int i = 0; i < subarray.length; i++ ) { + subarray[i] = new Object[dimensions[dimension + 1]]; + preallocateArray((Object[])subarray[i], dimension + 1, dimensionsToPreallocate); + } } public T get(final int... keys) { @@ -59,14 +94,30 @@ public class NestedIntegerArray { for( int i = 0; i < numNestedDimensions; i++ ) { if ( keys[i] >= dimensions[i] ) return null; + myData = (Object[])myData[keys[i]]; if ( myData == null ) return null; } + return (T)myData[keys[numNestedDimensions]]; } - public synchronized void put(final T value, final int... keys) { // WARNING! value comes before the keys! + /** + * Insert a value at the position specified by the given keys. 
+ * + * This method is thread-safe, however the caller MUST check the + * return value to see if the put succeeded. This method RETURNS FALSE if + * the value could not be inserted because there already was a value present + * at the specified location. In this case the caller should do a get() to get + * the already-existing value and (potentially) update it. + * + * @param value value to insert + * @param keys keys specifying the location of the value in the tree + * @return true if the value was inserted, false if it could not be inserted because there was already + * a value at the specified position + */ + public boolean put(final T value, final int... keys) { // WARNING! value comes before the keys! if ( keys.length != numDimensions ) throw new ReviewedStingException("Exactly " + numDimensions + " keys should be passed to this NestedIntegerArray but " + keys.length + " were provided"); @@ -75,15 +126,35 @@ public class NestedIntegerArray { for ( int i = 0; i < numNestedDimensions; i++ ) { if ( keys[i] >= dimensions[i] ) throw new ReviewedStingException("Key " + keys[i] + " is too large for dimension " + i + " (max is " + (dimensions[i]-1) + ")"); - Object[] temp = (Object[])myData[keys[i]]; - if ( temp == null ) { - temp = new Object[dimensions[i+1]]; - myData[keys[i]] = temp; + + // If we're at or beyond the last dimension that was pre-allocated, we need to do a synchronized + // check to see if the next branch exists, and if it doesn't, create it + if ( i >= NUM_DIMENSIONS_TO_PREALLOCATE - 1 ) { + synchronized ( myData ) { + if ( myData[keys[i]] == null ) { + myData[keys[i]] = new Object[dimensions[i + 1]]; + } + } } - myData = temp; + + myData = (Object[])myData[keys[i]]; } - myData[keys[numNestedDimensions]] = value; + synchronized ( myData ) { // lock the bottom row while we examine and (potentially) update it + + // Insert the new value only if there still isn't any existing value in this position + if ( myData[keys[numNestedDimensions]] == null ) { + 
myData[keys[numNestedDimensions]] = value; + } + else { + // Already have a value for this leaf (perhaps another thread came along and inserted one + // while we traversed the tree), so return false to notify the caller that we didn't put + // the item + return false; + } + } + + return true; } public List getAllValues() { diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 3130469e5..a49a12292 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -63,6 +63,18 @@ public class UserException extends ReviewedStingException { } } + public static class MalformedReadFilterException extends CommandLineException { + public MalformedReadFilterException(String message) { + super(String.format("Malformed read filter: %s",message)); + } + } + + public static class MalformedWalkerArgumentsException extends CommandLineException { + public MalformedWalkerArgumentsException(String message) { + super(String.format("Malformed walker argument: %s",message)); + } + } + public static class MalformedGenomeLoc extends UserException { public MalformedGenomeLoc(String message, GenomeLoc loc) { super(String.format("Badly formed genome loc: %s: %s", message, loc)); @@ -117,6 +129,12 @@ public class UserException extends ReviewedStingException { } } + public static class LocalParallelizationProblem extends UserException { + public LocalParallelizationProblem(final File file) { + super(String.format("There was a failure because temporary file %s could not be found while running the GATK with more than one thread. 
Possible causes for this problem include: your system's open file handle limit is too small, your output or temp directories do not have sufficient space, or just an isolated file system blip", file.getAbsolutePath())); + } + } + public static class NotEnoughMemory extends UserException { public NotEnoughMemory() { super(String.format("There was a failure because you did not provide enough memory to run this program. See the -Xmx JVM argument to adjust the maximum heap size provided to Java")); @@ -129,6 +147,12 @@ public class UserException extends ReviewedStingException { } } + public static class NoSpaceOnDevice extends UserException { + public NoSpaceOnDevice() { + super("There is no space left on the device, so writing failed"); + } + } + public static class CouldNotReadInputFile extends UserException { public CouldNotReadInputFile(String message, Exception e) { super(String.format("Couldn't read file because %s caused by %s", message, getMessage(e))); @@ -142,6 +166,10 @@ public class UserException extends ReviewedStingException { super(String.format("Couldn't read file %s because %s", file.getAbsolutePath(), message)); } + public CouldNotReadInputFile(String file, String message) { + super(String.format("Couldn't read file %s because %s", file, message)); + } + public CouldNotReadInputFile(File file, String message, Exception e) { super(String.format("Couldn't read file %s because %s with exception %s", file.getAbsolutePath(), message, getMessage(e))); } @@ -255,6 +283,12 @@ public class UserException extends ReviewedStingException { } } + public static class FailsStrictValidation extends UserException { + public FailsStrictValidation(File f, String message) { + super(String.format("File %s fails strict validation: %s", f.getAbsolutePath(), message)); + } + } + public static class MalformedFile extends UserException { public MalformedFile(String message) { super(String.format("Unknown file is malformed: %s", message)); @@ -324,6 +358,9 @@ public class 
UserException extends ReviewedStingException { } public static class CannotExecuteQScript extends UserException { + public CannotExecuteQScript(String message) { + super(String.format("Unable to execute QScript: %s", message)); + } public CannotExecuteQScript(String message, Exception e) { super(String.format("Unable to execute QScript: " + message), e); } diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index 48706543a..db54851dd 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -29,6 +29,7 @@ import net.sf.picard.reference.FastaSequenceIndex; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.SAMSequenceRecord; +import org.apache.log4j.Priority; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.File; @@ -38,14 +39,11 @@ import java.util.Arrays; /** * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. * - * Thread-safe! Uses a lock object to protect write and access to the cache. + * Thread-safe! Uses a thread-local cache */ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); - /** global enable flag */ - private static final boolean USE_CACHE = true; - /** do we want to print debugging information about cache efficiency?
*/ private static final boolean PRINT_EFFICIENCY = false; @@ -53,31 +51,29 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { private static final int PRINT_FREQUENCY = 10000; /** The default cache size in bp */ - private static final long DEFAULT_CACHE_SIZE = 1000000; + public static final long DEFAULT_CACHE_SIZE = 1000000; + + /** The cache size of this CachingIndexedFastaSequenceFile */ + final long cacheSize; + + /** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */ + final long cacheMissBackup; // information about checking efficiency long cacheHits = 0; long cacheMisses = 0; - /** The cache size of this CachingIndexedFastaSequenceFile */ - long cacheSize = DEFAULT_CACHE_SIZE; - - /** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */ - long cacheMissBackup = 100; - /** Represents a specific cached sequence, with a specific start and stop, as well as the bases */ private static class Cache { long start = -1, stop = -1; ReferenceSequence seq = null; } + /** + * Thread local cache to allow multi-threaded use of this class + */ private ThreadLocal cache; - { - resetThreadLocalCache(); - } - - protected void resetThreadLocalCache() { cache = new ThreadLocal () { @Override protected Cache initialValue() { return new Cache(); @@ -87,76 +83,107 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { /** * Same as general constructor but allows one to override the default cacheSize - * @param file + * + * @param fasta * @param index * @param cacheSize */ - public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index, long cacheSize) { - super(file, index); - setCacheSize(cacheSize); - } - - private void setCacheSize(long cacheSize) { + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { + super(fasta, index); + if ( cacheSize < 0 ) throw new 
IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); } /** * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * @param file The file to open. + * + * @param fasta The file to open. * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. */ - public CachingIndexedFastaSequenceFile(final File file, final FastaSequenceIndex index) { - this(file, index, DEFAULT_CACHE_SIZE); + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { + this(fasta, index, DEFAULT_CACHE_SIZE); } /** * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * @param file The file to open. + * + * Looks for a index file for fasta on disk + * + * @param fasta The file to open. */ - public CachingIndexedFastaSequenceFile(final File file) throws FileNotFoundException { - this(file, DEFAULT_CACHE_SIZE); + public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException { + this(fasta, DEFAULT_CACHE_SIZE); } - public CachingIndexedFastaSequenceFile(final File file, long cacheSize ) throws FileNotFoundException { - super(file); - setCacheSize(cacheSize); + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk + * Uses provided cacheSize instead of the default + * + * @param fasta The file to open. 
+ */ + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { + super(fasta); + if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); + this.cacheSize = cacheSize; + this.cacheMissBackup = Math.max(cacheSize / 1000, 1); } - public void printEfficiency() { - // comment out to disable tracking - if ( (cacheHits + cacheMisses) % PRINT_FREQUENCY == 0 ) { - logger.info(String.format("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%%n", cacheHits, cacheMisses, calcEfficiency())); - } + /** + * Print the efficiency (hits / queries) to logger with priority + */ + public void printEfficiency(final Priority priority) { + logger.log(priority, String.format("### CachingIndexedFastaReader: hits=%d misses=%d efficiency %.6f%%", cacheHits, cacheMisses, calcEfficiency())); } + /** + * Returns the efficiency (% of hits of all queries) of this object + * @return + */ public double calcEfficiency() { return 100.0 * cacheHits / (cacheMisses + cacheHits * 1.0); } + /** + * @return the number of cache hits that have occurred + */ public long getCacheHits() { return cacheHits; } + /** + * @return the number of cache misses that have occurred + */ public long getCacheMisses() { return cacheMisses; } + /** + * @return the size of the cache we are using + */ + public long getCacheSize() { + return cacheSize; + } /** * Gets the subsequence of the contig in the range [start,stop] + * + * Uses the sequence cache if possible, or updates the cache to handle the request. If the range + * is larger than the cache itself, just loads the sequence directly, not changing the cache at all + * * @param contig Contig whose subsequence to retrieve. * @param start inclusive, 1-based start of region. * @param stop inclusive, 1-based stop of region. * @return The partial reference sequence associated with this range. 
*/ - public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) { - ReferenceSequence result; - Cache myCache = cache.get(); - //System.out.printf("getSubsequentAt cache=%s%n", myCache); + public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) { + final ReferenceSequence result; + final Cache myCache = cache.get(); - if ( ! USE_CACHE || (stop - start) >= cacheSize ) { + if ( (stop - start) >= cacheSize ) { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); } else { @@ -177,8 +204,8 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { } // at this point we determine where in the cache we want to extract the requested subsequence - int cacheOffsetStart = (int)(start - myCache.start); - int cacheOffsetStop = (int)(stop - start + cacheOffsetStart + 1); + final int cacheOffsetStart = (int)(start - myCache.start); + final int cacheOffsetStop = (int)(stop - start + cacheOffsetStart + 1); try { result = new ReferenceSequence(myCache.seq.getName(), myCache.seq.getContigIndex(), Arrays.copyOfRange(myCache.seq.getBases(), cacheOffsetStart, cacheOffsetStop)); @@ -188,12 +215,8 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { } } -// // comment out to disable testing -// ReferenceSequence verify = super.getSubsequenceAt(contig, start, stop); -// if ( ! 
Arrays.equals(verify.getBases(), result.getBases()) ) -// throw new ReviewedStingException(String.format("BUG: cached reference sequence not the same as clean fetched version at %s %d %d", contig, start, stop)); - - if ( PRINT_EFFICIENCY ) printEfficiency(); + if ( PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0 ) + printEfficiency(Priority.INFO); return result; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 2f31c154c..a4a5d578a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -128,22 +128,13 @@ public class FragmentUtils { return create(reads, reads.size(), SamRecordGetter); } - public final static List mergeOverlappingPairedFragments( List overlappingPair ) { + public final static List mergeOverlappingPairedFragments( final List overlappingPair ) { final byte MIN_QUAL_BAD_OVERLAP = 16; if( overlappingPair.size() != 2 ) { throw new ReviewedStingException("Found overlapping pair with " + overlappingPair.size() + " reads, but expecting exactly 2."); } GATKSAMRecord firstRead = overlappingPair.get(0); GATKSAMRecord secondRead = overlappingPair.get(1); - /* - System.out.println("read 0 unclipped start:"+overlappingPair.get(0).getUnclippedStart()); - System.out.println("read 0 unclipped end:"+overlappingPair.get(0).getUnclippedEnd()); - System.out.println("read 1 unclipped start:"+overlappingPair.get(1).getUnclippedStart()); - System.out.println("read 1 unclipped end:"+overlappingPair.get(1).getUnclippedEnd()); - System.out.println("read 0 start:"+overlappingPair.get(0).getAlignmentStart()); - System.out.println("read 0 end:"+overlappingPair.get(0).getAlignmentEnd()); - System.out.println("read 1 start:"+overlappingPair.get(1).getAlignmentStart()); - 
System.out.println("read 1 end:"+overlappingPair.get(1).getAlignmentEnd()); - */ + if( !(secondRead.getSoftStart() <= firstRead.getSoftEnd() && secondRead.getSoftStart() >= firstRead.getSoftStart() && secondRead.getSoftEnd() >= firstRead.getSoftEnd()) ) { firstRead = overlappingPair.get(1); // swap them secondRead = overlappingPair.get(0); @@ -155,15 +146,6 @@ public class FragmentUtils { return overlappingPair; // fragments contain indels so don't merge them } -/* // check for inconsistent start positions between uncliped/soft alignment starts - if (secondRead.getAlignmentStart() >= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() < firstRead.getUnclippedStart()) - return overlappingPair; - if (secondRead.getAlignmentStart() <= firstRead.getAlignmentStart() && secondRead.getUnclippedStart() > firstRead.getUnclippedStart()) - return overlappingPair; - - if (secondRead.getUnclippedStart() < firstRead.getAlignmentEnd() && secondRead.getAlignmentStart() >= firstRead.getAlignmentEnd()) - return overlappingPair; - */ final Pair pair = ReadUtils.getReadCoordinateForReferenceCoordinate(firstRead, secondRead.getSoftStart()); final int firstReadStop = ( pair.getSecond() ? 
pair.getFirst() + 1 : pair.getFirst() ); @@ -183,7 +165,7 @@ public class FragmentUtils { } for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) { - return overlappingPair;// high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them + return overlappingPair; // high qual bases don't match exactly, probably indel in only one of the fragments, so don't merge them } if( firstReadQuals[iii] < MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] < MIN_QUAL_BAD_OVERLAP ) { return overlappingPair; // both reads have low qual bases in the overlap region so don't merge them because don't know what is going on @@ -197,7 +179,7 @@ public class FragmentUtils { } final GATKSAMRecord returnRead = new GATKSAMRecord( firstRead.getHeader() ); - returnRead.setAlignmentStart( firstRead.getUnclippedStart() ); + returnRead.setAlignmentStart( firstRead.getSoftStart() ); returnRead.setReadBases( bases ); returnRead.setBaseQualities( quals ); returnRead.setReadGroup( firstRead.getReadGroup() ); diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java new file mode 100644 index 000000000..22d249240 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/PerReadAlleleLikelihoodMap.java @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, 
and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.utils.genotyper; + + +import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.io.PrintStream; +import java.lang.reflect.Constructor; +import java.util.*; + +public abstract class PerReadAlleleLikelihoodMap { + + public static final double INDEL_LIKELIHOOD_THRESH = 0.1; + + protected List alleles; + protected Map> likelihoodReadMap; + + public abstract void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log); + public abstract ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log); + + public void add(GATKSAMRecord read, Allele a, Double likelihood) { + Map likelihoodMap; + if (likelihoodReadMap.containsKey(read)){ + // seen pileup element before + likelihoodMap = likelihoodReadMap.get(read); + } + else { + likelihoodMap = new HashMap(); 
+ likelihoodReadMap.put(read,likelihoodMap); + } + likelihoodMap.put(a,likelihood); + + if (!alleles.contains(a)) + alleles.add(a); + + } + + public int size() { + return likelihoodReadMap.size(); + } + + public void add(PileupElement p, Allele a, Double likelihood) { + add(p.getRead(),a,likelihood); + } + + public boolean containsPileupElement(PileupElement p) { + return likelihoodReadMap.containsKey(p.getRead()); + } + + public boolean isEmpty() { + return likelihoodReadMap.isEmpty(); + } + + public Map> getLikelihoodReadMap() { + return likelihoodReadMap; + } + public void clear() { + alleles.clear(); + likelihoodReadMap.clear(); + } + + public Set getStoredElements() { + return likelihoodReadMap.keySet(); + } + + public Collection> getLikelihoodMapValues() { + return likelihoodReadMap.values(); + } + + public int getNumberOfStoredElements() { + return likelihoodReadMap.size(); + } + + public Map getLikelihoodsAssociatedWithPileupElement(PileupElement p) { + if (!likelihoodReadMap.containsKey(p.getRead())) + return null; + + return likelihoodReadMap.get(p.getRead()); + } + + public static Allele getMostLikelyAllele( final Map alleleMap ) { + double maxLike = Double.NEGATIVE_INFINITY; + double prevMaxLike = Double.NEGATIVE_INFINITY; + Allele mostLikelyAllele = Allele.NO_CALL; + + for (final Map.Entry el : alleleMap.entrySet()) { + if (el.getValue() > maxLike) { + prevMaxLike = maxLike; + maxLike = el.getValue(); + mostLikelyAllele = el.getKey(); + } else if( el.getValue() > prevMaxLike ) { + prevMaxLike = el.getValue(); + } + } + return (maxLike - prevMaxLike > INDEL_LIKELIHOOD_THRESH ? 
mostLikelyAllele : Allele.NO_CALL ); + } + + public static PerReadAlleleLikelihoodMap getBestAvailablePerReadAlleleLikelihoodMap() { + final Class PerReadAlleleLikelihoodMapClass = GATKLiteUtils.getProtectedClassIfAvailable(PerReadAlleleLikelihoodMap.class); + try { + Constructor constructor = PerReadAlleleLikelihoodMapClass.getDeclaredConstructor((Class[])null); + constructor.setAccessible(true); + return (PerReadAlleleLikelihoodMap)constructor.newInstance(); + } + catch (Exception e) { + throw new ReviewedStingException("Unable to create RecalibrationEngine class instance " + PerReadAlleleLikelihoodMapClass.getSimpleName()); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/StandardPerReadAlleleLikelihoodMap.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/StandardPerReadAlleleLikelihoodMap.java new file mode 100644 index 000000000..7db818592 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/StandardPerReadAlleleLikelihoodMap.java @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2011 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ +package org.broadinstitute.sting.utils.genotyper; + + +import org.broadinstitute.sting.utils.classloader.PublicPackageSource; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.broadinstitute.sting.utils.variantcontext.Allele; + +import java.io.PrintStream; +import java.util.*; + +public class StandardPerReadAlleleLikelihoodMap extends PerReadAlleleLikelihoodMap implements PublicPackageSource { + + public StandardPerReadAlleleLikelihoodMap() { + likelihoodReadMap = new LinkedHashMap>(); + alleles = new ArrayList(); + } + + // not implemented in the standard version + public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {} + public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) { return pileup; } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java index 1dfc4ecc0..fe5f48a48 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java @@ -44,7 +44,7 @@ public class ForumAPIUtils { /** * How we post to the forum */ - private final static String API_URL = "https://gatk.vanillaforums.com/"; + private final static String API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; final private static String ACCESS_TOKEN = "access_token="; public static List getPostedTools(String forumKey) { diff --git 
a/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java index 25ef8ccd2..0f6808718 100755 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpFormatter.java @@ -26,10 +26,7 @@ package org.broadinstitute.sting.utils.help; import org.apache.log4j.Logger; -import org.broadinstitute.sting.commandline.ArgumentDefinition; -import org.broadinstitute.sting.commandline.ArgumentDefinitionGroup; -import org.broadinstitute.sting.commandline.ArgumentDefinitions; -import org.broadinstitute.sting.commandline.ArgumentMatchSource; +import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.text.TextFormattingUtils; @@ -273,9 +270,9 @@ public class HelpFormatter { * Generate a standard header for the logger * * @param applicationDetails details of the application to run. 
- * @param parsedArgs the command line arguments passed in + * @param parsedArgs the arguments passed in */ - public static void generateHeaderInformation(ApplicationDetails applicationDetails, Map> parsedArgs) { + public static void generateHeaderInformation(ApplicationDetails applicationDetails, Map parsedArgs) { DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); java.util.Date date = new java.util.Date(); @@ -286,19 +283,16 @@ public class HelpFormatter { for (String headerLine : applicationDetails.applicationHeader) logger.info(headerLine); logger.debug("Current directory: " + System.getProperty("user.dir")); - for (Map.Entry> entry: parsedArgs.entrySet()) { + for (Map.Entry entry: parsedArgs.entrySet()) { ArgumentMatchSource matchSource = entry.getKey(); final String sourceName; switch (matchSource.getType()) { case CommandLine: sourceName = "Program"; break; - case File: sourceName = matchSource.getFile().getPath(); break; + case Provider: sourceName = matchSource.getDescription(); break; default: throw new RuntimeException("Unexpected argument match source type: " + matchSource.getType()); } - String output = sourceName + " Args:"; - for (String str : entry.getValue()) { - output = output + " " + str; - } + String output = sourceName + " Args: " + entry.getValue().getDescription(); logger.info(output); } logger.info("Date/Time: " + dateFormat.format(date)); diff --git a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java index 160df0e51..b79211e74 100644 --- a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java @@ -48,14 +48,23 @@ public class IOUtils { * @param tempDir Temporary directory. 
*/ public static void checkTempDir(File tempDir) { + if (isDefaultTempDir(tempDir)) + throw new UserException.BadTmpDir("java.io.tmpdir must be explicitly set"); + if (!tempDir.exists() && !tempDir.mkdirs()) + throw new UserException.BadTmpDir("Could not create directory: " + tempDir.getAbsolutePath()); + } + + /** + * Returns true if the directory is a default temporary directory. + * @param tempDir the directory to check. + * @return true if the directory is a default temporary directory. + */ + public static boolean isDefaultTempDir(File tempDir) { String tempDirPath = tempDir.getAbsolutePath(); // Keeps the user from leaving the temp directory as the default, and on Macs from having pluses // in the path which can cause problems with the Google Reflections library. // see also: http://benjchristensen.com/2009/09/22/mac-osx-10-6-java-java-io-tmpdir/ - if (tempDirPath.startsWith("/var/folders/") || (tempDirPath.equals("/tmp")) || (tempDirPath.equals("/tmp/"))) - throw new UserException.BadTmpDir("java.io.tmpdir must be explicitly set"); - if (!tempDir.exists() && !tempDir.mkdirs()) - throw new UserException.BadTmpDir("Could not create directory: " + tempDir.getAbsolutePath()); + return (tempDirPath.startsWith("/var/folders/") || (tempDirPath.equals("/tmp")) || (tempDirPath.equals("/tmp/"))); } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java new file mode 100644 index 000000000..d0ad51cb0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/EOFMarkedValue.java @@ -0,0 +1,80 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Wrapper to hold data that distinguishes a special EOF marker from a real object + * + * The only way to tell in a consumer thread that a blocking queue has no more data ever + * coming down the pipe is to pass in a "poison" or EOF object.
This class provides + * a generic capacity for that... + * + * The use case looks like this: + * + * BlockingQueue q + * producer: + * while ( x has items ) + * q.put(new EOFMarkedValue(x)) + * q.put(new EOFMarkedValue()) + * + * Consumer: + * while ( true ) + * value = q.take() + * if ( value.isEOFMarker() ) + * break + * else + * do something useful with value + * + * + * User: depristo + * Date: 9/6/12 + * Time: 3:08 PM + */ +//@Invariant("! isEOFMarker() || value == null") +class EOFMarkedValue { + /** + * True if this is the EOF marker object + */ + final private boolean isLast; + + /** + * Our value, if we aren't the EOF marker + */ + final private T value; + + /** + * Create a new EOFMarkedValue containing a real value, where last is false + * @param value + */ + EOFMarkedValue(final T value) { + isLast = false; + this.value = value; + } + + /** + * Create a new EOFMarkedValue that is the last item + */ + EOFMarkedValue() { + isLast = true; + this.value = null; + } + + /** + * Is this the EOF marker? 
+ * + * @return true if so, else false + */ + public boolean isEOFMarker() { + return isLast; + } + + /** + * Get the value held by this EOFMarkedValue + * + * @return the value + * @throws IllegalStateException if this is the last item + */ + public T getValue() { + if ( isEOFMarker() ) + throw new IllegalStateException("Cannot get value for last object"); + return value; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java new file mode 100644 index 000000000..bd99a9266 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/InputProducer.java @@ -0,0 +1,200 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; +import org.broadinstitute.sting.utils.SimpleTimer; + +import java.util.Iterator; +import java.util.concurrent.BlockingQueue; +import java.util.concurrent.CountDownLatch; + +/** + * Producer Thread that reads input values from an inputReader and puts them into an output queue + */ +class InputProducer implements Runnable { + private final static Logger logger = Logger.getLogger(InputProducer.class); + + /** + * The iterator we are using to get data from + */ + final Iterator inputReader; + + /** + * Our timer (must not be null) that we use to track our input costs + */ + final SimpleTimer inputTimer; + + /** + * Where we put our input values for consumption + */ + final BlockingQueue outputQueue; + + final MultiThreadedErrorTracker errorTracker; + + /** + * Have we read the last value from inputReader?
+ * + * Must be a local variable, as inputReader.hasNext() can actually end up doing a lot + * of work, and the method getNumInputValues() is supposed to be called not in the + * thread executing the reading of values but in the thread enqueuing results + */ + boolean readLastValue = false; + + int nRead = 0; + int inputID = -1; + + /** + * A latch used to block threads that want to start up only when all of the values + * in inputReader have been read by the thread executing run() + */ + final CountDownLatch latch = new CountDownLatch(1); + + public InputProducer(final Iterator inputReader, + final MultiThreadedErrorTracker errorTracker, + final SimpleTimer inputTimer, + final BlockingQueue outputQueue) { + if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); + if ( errorTracker == null ) throw new IllegalArgumentException("errorTracker cannot be null"); + if ( inputTimer == null ) throw new IllegalArgumentException("inputTimer cannot be null"); + if ( outputQueue == null ) throw new IllegalArgumentException("OutputQueue cannot be null"); + + this.inputReader = inputReader; + this.errorTracker = errorTracker; + this.inputTimer = inputTimer; + this.outputQueue = outputQueue; + } + + /** + * Returns the number of elements in the input stream, AFTER we've read all of the values. + * If we haven't read them all yet, returns -1 + * + * @return the total number of elements in input stream, or -1 if some are still to be read + */ + public synchronized int getNumInputValues() { + return allInputsHaveBeenRead() ? nRead : -1; + } + + /** + * Returns true if all of the elements have been read from the input stream + * + * @return true if all of the elements have been read from the input stream + */ + public synchronized boolean allInputsHaveBeenRead() { + return readLastValue; + } + + /** + * Read the next item from the input stream, if possible + * + * If the inputReader has values, returns them, otherwise return null. 
+ * + * This method is synchronized, as it manipulates local state accessed across multiple threads. + * + * @return the next input stream value, or null if the stream contains no more elements + * @throws InterruptedException + */ + private synchronized InputType readNextItem() throws InterruptedException { + inputTimer.restart(); + if ( ! inputReader.hasNext() ) { + // we are done, mark ourselves as such and return null + readLastValue = true; + inputTimer.stop(); + return null; + } else { + // get the next value, and return it + final InputType input = inputReader.next(); + inputTimer.stop(); + nRead++; + return input; + } + } + + /** + * Run this input producer, looping over all items in the input reader and + * enqueueing them as InputValues into the outputQueue. After the + * end of the stream has been encountered, any threads waiting because + * they called waitForDone() will be freed. + */ + public void run() { + try { + while ( true ) { + final InputType value = readNextItem(); + + if ( value == null ) { + // add the EOF object so our consumer knows we are done in all inputs + // note that we do not increase inputID here, so that variable indicates the ID + // of the last real value read from the queue + outputQueue.put(new InputValue(inputID + 1)); + break; + } else { + // add the actual value to the outputQueue + outputQueue.put(new InputValue(++inputID, value)); + } + } + + latch.countDown(); + } catch (Exception ex) { + errorTracker.notifyOfError(ex); + } + } + + /** + * Block until all of the items have been read from inputReader. + * + * Note that this call doesn't actually read anything. You have to submit a thread + * to actually execute run() directly. 
+ * + * @throws InterruptedException + */ + public void waitForDone() throws InterruptedException { + latch.await(); + } + + /** + * Helper class that contains a read value suitable for EOF marking in a BlockingQueue + * + * This class also contains an ID, an integer incrementing from 0 to N, for N total + * values in the input stream. This ID indicates which element in the element stream this + * InputValue corresponds to. Necessary for tracking and ordering results by input position. + * + * Note that EOF markers have IDs > N, and ID values >> N can occur if many EOF markers + * are enqueued in the outputQueue. + */ + class InputValue extends EOFMarkedValue { + final int id; + + private InputValue(final int id, InputType datum) { + super(datum); + if ( id < 0 ) throw new IllegalArgumentException("id must be >= 0"); + this.id = id; + } + private InputValue(final int id) { + super(); + if ( id < 0 ) throw new IllegalArgumentException("id must be >= 0"); + this.id = id; + } + + /** + * Returns the ID of this input marker + * @return id >= 0 + */ + public int getId() { + return id; + } + + /** + * Create another EOF marker with ID + 1 to this one. + * + * Useful in the case where we need to enqueue another EOF marker for future jobs and we + * want them to have a meaningful ID, one greater than the last one. + * + * @return ID + */ + //@Ensures({"result.isEOFMarker()", "result.getId() == getId() + 1"}) + public InputValue nextEOF() { + if ( ! 
isEOFMarker() ) + throw new IllegalArgumentException("Cannot request next EOF marker for non-EOF marker InputValue"); + return new InputValue(getId() + 1); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java new file mode 100644 index 000000000..83d671560 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/MapResult.java @@ -0,0 +1,58 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Holds the results of a map job suitable for producer/consumer threading + * via a BlockingQueue + */ +class MapResult extends EOFMarkedValue implements Comparable> { + final int jobID; + + /** + * Create a new MapResult with value datum and job ID jobID + * + * @param datum the value produced by the map job + * @param jobID the id of the map job (for correctness testing) + */ + MapResult(final MapType datum, final int jobID) { + super(datum); + this.jobID = jobID; + if ( jobID < 0 ) throw new IllegalArgumentException("JobID must be >= 0"); + } + + MapResult(final int jobID) { + super(); + this.jobID = jobID; + if ( jobID < 0 ) throw new IllegalArgumentException("JobID must be >= 0"); + } + + /** + * Create the EOF marker version of MapResult + */ + MapResult() { + super(); + this.jobID = Integer.MAX_VALUE; + } + + /** + * @return the job ID of the map job that produced this MapResult + */ + public int getJobID() { + return jobID; + } + + /** + * Compare these MapResults in order of JobID.
+ * + * @param o + * @return + */ + @Override + public int compareTo(MapResult o) { + return Integer.valueOf(jobID).compareTo(o.getJobID()); + } + + @Override + public String toString() { + return "[MapResult id=" + jobID + "]"; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java new file mode 100644 index 000000000..cc5335051 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSMapFunction.java @@ -0,0 +1,19 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * A function that maps from InputType -> ResultType + * + * For use with the NanoScheduler + * + * User: depristo + * Date: 8/24/12 + * Time: 9:49 AM + */ +public interface NSMapFunction { + /** + * Apply this function to input, returning a value of ResultType + * @param input + * @return + */ + public ResultType apply(final InputType input); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java new file mode 100644 index 000000000..8b12c62c4 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSProgressFunction.java @@ -0,0 +1,12 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * Created with IntelliJ IDEA. + * User: depristo + * Date: 9/4/12 + * Time: 2:10 PM + * To change this template use File | Settings | File Templates.
+ */ +public interface NSProgressFunction { + public void progress(final InputType lastMapInput); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java new file mode 100644 index 000000000..879a33a1d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSReduceFunction.java @@ -0,0 +1,18 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +/** + * A function that combines a value of MapType with an existing ReduceValue into a new ResultType + * + * User: depristo + * Date: 8/24/12 + * Time: 9:49 AM + */ +public interface NSReduceFunction { + /** + * Combine one with sum into a new ReduceType + * @param one the result of a map call on an input element + * @param sum the cumulative reduce result over all previous map calls + * @return + */ + public ReduceType apply(MapType one, ReduceType sum); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java new file mode 100644 index 000000000..0926b4c50 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NSRuntimeProfile.java @@ -0,0 +1,67 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.broadinstitute.sting.utils.SimpleTimer; + +/** + * Holds runtime profile (input, read, map) times as tracked by NanoScheduler + * + * User: depristo + * Date: 9/10/12 + * Time: 8:31 PM + */ +public class NSRuntimeProfile { + final SimpleTimer outsideSchedulerTimer = new SimpleTimer("outside"); + final SimpleTimer inputTimer = new SimpleTimer("input"); + final SimpleTimer mapTimer = new SimpleTimer("map"); + final SimpleTimer reduceTimer = new SimpleTimer("reduce"); + + /** + * Combine the elapsed time 
information from other with this profile + * + * @param other a non-null profile + */ + public void combine(final NSRuntimeProfile other) { + outsideSchedulerTimer.addElapsed(other.outsideSchedulerTimer); + inputTimer.addElapsed(other.inputTimer); + mapTimer.addElapsed(other.mapTimer); + reduceTimer.addElapsed(other.reduceTimer); + } + + /** + * Print the runtime profiling to logger + * + * @param logger + */ + public void log(final Logger logger) { + log1(logger, "Input time", inputTimer); + log1(logger, "Map time", mapTimer); + log1(logger, "Reduce time", reduceTimer); + log1(logger, "Outside time", outsideSchedulerTimer); + } + + /** + * @return the total runtime for all functions of this nano scheduler + */ + //@Ensures("result >= 0.0") + public double totalRuntimeInSeconds() { + return inputTimer.getElapsedTime() + + mapTimer.getElapsedTime() + + reduceTimer.getElapsedTime() + + outsideSchedulerTimer.getElapsedTime(); + } + + /** + * Print to logger.info timing information from timer, with name label + * + * @param label the name of the timer to display. 
Should be human readable + * @param timer the timer whose elapsed time we will display + */ + //@Requires({"label != null", "timer != null"}) + private void log1(final Logger logger, final String label, final SimpleTimer timer) { + final double myTimeInSec = timer.getElapsedTime(); + final double myTimePercent = myTimeInSec / totalRuntimeInSeconds() * 100; + logger.info(String.format("%s: %s (%5.2f%%)", label, new AutoFormattingTime(myTimeInSec), myTimePercent)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java new file mode 100644 index 000000000..d83a23c0f --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/NanoScheduler.java @@ -0,0 +1,519 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; +import org.broadinstitute.sting.utils.threading.NamedThreadFactory; + +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.*; + +/** + * Framework for very fine grained MapReduce parallelism + * + * The overall framework works like this + * + * nano <- new Nanoschedule(bufferSize, numberOfMapElementsToProcessTogether, nThreads) + * List[Input] outerData : outerDataLoop ) + * result = nano.execute(outerData.iterator(), map, reduce) + * + * bufferSize determines how many elements from the input stream are read in one go by the + * nanoscheduler. The scheduler may hold up to bufferSize in memory at one time, as well + * as up to bufferSize map results as well. + * + * numberOfMapElementsToProcessTogether determines how many input elements are processed + * together each thread cycle. 
For example, if this value is 10, then the input data + * is grouped together in units of 10 elements each, and map called on each in term. The more + * heavy-weight the map function is, in terms of CPU costs, the more it makes sense to + * have this number be small. The lighter the CPU cost per element, though, the more this + * parameter introduces overhead due to need to context switch among threads to process + * each input element. A value of -1 lets the nanoscheduler guess at a reasonable trade-off value. + * + * nThreads is a bit obvious yes? Note though that the nanoscheduler assumes that it gets 1 thread + * from its client during the execute call, as this call blocks until all work is done. The caller + * thread is put to work by execute to help with the processing of the data. So in reality the + * nanoScheduler only spawn nThreads - 1 additional workers (if this is > 1). + * + * User: depristo + * Date: 8/24/12 + * Time: 9:47 AM + */ +public class NanoScheduler { + private final static Logger logger = Logger.getLogger(NanoScheduler.class); + private final static boolean ALLOW_SINGLE_THREAD_FASTPATH = true; + private final static boolean LOG_MAP_TIMES = false; + + final int bufferSize; + final int nThreads; + final ExecutorService inputExecutor; + final ExecutorService masterExecutor; + final ExecutorService mapExecutor; + final Semaphore runningMapJobSlots; + final MultiThreadedErrorTracker errorTracker = new MultiThreadedErrorTracker(); + + boolean shutdown = false; + boolean debug = false; + private NSProgressFunction progressFunction = null; + + /** + * Tracks the combined runtime profiles across all created nano schedulers + */ + final static private NSRuntimeProfile combinedNSRuntimeProfiler = new NSRuntimeProfile(); + + /** + * The profile specific to this nano scheduler + */ + final private NSRuntimeProfile myNSRuntimeProfile = new NSRuntimeProfile(); + + /** + * Create a new nanoscheduler with the desire characteristics requested by the argument 
+ * + * @param nThreads the number of threads to use to get work done, in addition to the + * thread calling execute + */ + public NanoScheduler(final int nThreads) { + this(nThreads*100, nThreads); + } + + protected NanoScheduler(final int bufferSize, final int nThreads) { + if ( bufferSize < 1 ) throw new IllegalArgumentException("bufferSize must be >= 1, got " + bufferSize); + if ( nThreads < 1 ) throw new IllegalArgumentException("nThreads must be >= 1, got " + nThreads); + + this.bufferSize = bufferSize; + this.nThreads = nThreads; + + if ( nThreads == 1 ) { + this.mapExecutor = this.inputExecutor = this.masterExecutor = null; + runningMapJobSlots = null; + } else { + this.mapExecutor = Executors.newFixedThreadPool(nThreads - 1, new NamedThreadFactory("NS-map-thread-%d")); + runningMapJobSlots = new Semaphore(this.bufferSize); + + this.inputExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-input-thread-%d")); + this.masterExecutor = Executors.newSingleThreadExecutor(new NamedThreadFactory("NS-master-thread-%d")); + } + + // start timing the time spent outside of the nanoScheduler + myNSRuntimeProfile.outsideSchedulerTimer.start(); + } + + /** + * The number of parallel map threads in use with this NanoScheduler + * @return + */ + @Ensures("result > 0") + public int getnThreads() { + return nThreads; + } + + /** + * The input buffer size used by this NanoScheduler + * @return + */ + @Ensures("result > 0") + public int getBufferSize() { + return this.bufferSize; + } + + /** + * Tells this nanoScheduler to shutdown immediately, releasing all its resources. 
+ * + * After this call, execute cannot be invoked without throwing an error + */ + public void shutdown() { + myNSRuntimeProfile.outsideSchedulerTimer.stop(); + + // add my timing information to the combined NS runtime profile + combinedNSRuntimeProfiler.combine(myNSRuntimeProfile); + + if ( nThreads > 1 ) { + shutdownExecutor("inputExecutor", inputExecutor); + shutdownExecutor("mapExecutor", mapExecutor); + shutdownExecutor("masterExecutor", masterExecutor); + } + + shutdown = true; + } + + public void printRuntimeProfile() { + myNSRuntimeProfile.log(logger); + } + + public static void printCombinedRuntimeProfile() { + if ( combinedNSRuntimeProfiler.totalRuntimeInSeconds() > 0.1 ) + combinedNSRuntimeProfiler.log(logger); + } + + protected double getTotalRuntime() { + return myNSRuntimeProfile.totalRuntimeInSeconds(); + } + + /** + * Helper function to cleanly shutdown an execution service, checking that the execution + * state is clean when it's done. + * + * @param name a string name for error messages for the executorService we are shutting down + * @param executorService the executorService to shut down + */ + @Requires({"name != null", "executorService != null"}) + @Ensures("executorService.isShutdown()") + private void shutdownExecutor(final String name, final ExecutorService executorService) { + if ( executorService.isShutdown() || executorService.isTerminated() ) + throw new IllegalStateException("Executor service " + name + " is already shut down!"); + + final List remaining = executorService.shutdownNow(); + if ( ! remaining.isEmpty() ) + throw new IllegalStateException(remaining.size() + " remaining tasks found in an executor " + name + ", unexpected behavior!"); + } + + /** + * @return true if this nanoScheduler is shutdown, or false if its still open for business + */ + public boolean isShutdown() { + return shutdown; + } + + /** + * @return are we displaying verbose debugging information about the scheduling? 
+ */ + public boolean isDebug() { + return debug; + } + + /** + * Helper function to display a String.formatted message if we are doing verbose debugging + * + * @param format the format argument suitable for String.format + * @param args the arguments for String.format + */ + @Requires("format != null") + protected void debugPrint(final String format, Object ... args) { + if ( isDebug() ) + logger.warn("Thread " + Thread.currentThread().getId() + ":" + String.format(format, args)); + } + + /** + * Turn on/off verbose debugging + * + * @param debug true if we want verbose debugging + */ + public void setDebug(boolean debug) { + this.debug = debug; + } + + /** + * Set the progress callback function to progressFunction + * + * The progress callback is invoked after each buffer size elements have been processed by map/reduce + * + * @param progressFunction a progress function to call, or null if you don't want any progress callback + */ + public void setProgressFunction(final NSProgressFunction progressFunction) { + this.progressFunction = progressFunction; + } + + /** + * Execute a map/reduce job with this nanoScheduler + * + * Data comes from inputReader. Will be read until hasNext() == false. + * map is called on each element provided by inputReader. No order of operations is guarenteed + * reduce is called in order of the input data provided by inputReader on the result of map() applied + * to each element. + * + * Note that the caller thread is put to work with this function call. The call doesn't return + * until all elements have been processes. + * + * It is safe to call this function repeatedly on a single nanoScheduler, at least until the + * shutdown method is called. + * + * Note that this function goes through a single threaded fast path if the number of threads + * is 1. 
+ * + * @param inputReader an iterator providing us with the input data to nanoSchedule map/reduce over + * @param map the map function from input type -> map type, will be applied in parallel to each input + * @param reduce the reduce function from map type + reduce type -> reduce type to be applied in order to map results + * @return the last reduce value + */ + public ReduceType execute(final Iterator inputReader, + final NSMapFunction map, + final ReduceType initialValue, + final NSReduceFunction reduce) { + if ( isShutdown() ) throw new IllegalStateException("execute called on already shutdown NanoScheduler"); + if ( inputReader == null ) throw new IllegalArgumentException("inputReader cannot be null"); + if ( map == null ) throw new IllegalArgumentException("map function cannot be null"); + if ( reduce == null ) throw new IllegalArgumentException("reduce function cannot be null"); + + myNSRuntimeProfile.outsideSchedulerTimer.stop(); + + ReduceType result; + if ( ALLOW_SINGLE_THREAD_FASTPATH && getnThreads() == 1 ) { + result = executeSingleThreaded(inputReader, map, initialValue, reduce); + } else { + result = executeMultiThreaded(inputReader, map, initialValue, reduce); + } + + myNSRuntimeProfile.outsideSchedulerTimer.restart(); + return result; + } + + /** + * Simple efficient reference implementation for single threaded execution. + * + * @return the reduce result of this map/reduce job + */ + @Requires({"inputReader != null", "map != null", "reduce != null"}) + private ReduceType executeSingleThreaded(final Iterator inputReader, + final NSMapFunction map, + final ReduceType initialValue, + final NSReduceFunction reduce) { + ReduceType sum = initialValue; + int i = 0; + + while ( true ) { + // start timer to ensure that both hasNext and next are caught by the timer + myNSRuntimeProfile.inputTimer.restart(); + if ( ! 
inputReader.hasNext() ) { + myNSRuntimeProfile.inputTimer.stop(); + break; + } else { + final InputType input = inputReader.next(); + myNSRuntimeProfile.inputTimer.stop(); + + // map + myNSRuntimeProfile.mapTimer.restart(); + final long preMapTime = LOG_MAP_TIMES ? 0 : myNSRuntimeProfile.mapTimer.currentTimeNano(); + final MapType mapValue = map.apply(input); + if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (myNSRuntimeProfile.mapTimer.currentTimeNano() - preMapTime)); + myNSRuntimeProfile.mapTimer.stop(); + + if ( i++ % this.bufferSize == 0 && progressFunction != null ) + progressFunction.progress(input); + + // reduce + myNSRuntimeProfile.reduceTimer.restart(); + sum = reduce.apply(mapValue, sum); + myNSRuntimeProfile.reduceTimer.stop(); + } + } + + return sum; + } + + /** + * Efficient parallel version of Map/Reduce + * + * @return the reduce result of this map/reduce job + */ + @Requires({"inputReader != null", "map != null", "reduce != null"}) + private ReduceType executeMultiThreaded(final Iterator inputReader, + final NSMapFunction map, + final ReduceType initialValue, + final NSReduceFunction reduce) { + debugPrint("Executing nanoScheduler"); + + // start up the master job + final MasterJob masterJob = new MasterJob(inputReader, map, initialValue, reduce); + final Future reduceResult = masterExecutor.submit(masterJob); + + while ( true ) { + // check that no errors occurred while we were waiting + handleErrors(); + + try { + final ReduceType result = reduceResult.get(100, TimeUnit.MILLISECONDS); + + // in case an error occurred in the reduce + handleErrors(); + + // return our final reduce result + return result; + } catch (final TimeoutException ex ) { + // a normal case -- we just aren't done + } catch (final InterruptedException ex) { + errorTracker.notifyOfError(ex); + // will handle error in the next round of the for loop + } catch (final ExecutionException ex) { + errorTracker.notifyOfError(ex); + // will handle error in the next round of the for loop 
+ } + } + } + + private void handleErrors() { + if ( errorTracker.hasAnErrorOccurred() ) { + masterExecutor.shutdownNow(); + mapExecutor.shutdownNow(); + inputExecutor.shutdownNow(); + errorTracker.throwErrorIfPending(); + } + } + + /** + * MasterJob has the task to enqueue Map jobs and wait for the final reduce + * + * It must be run in a separate thread in order to properly handle errors that may occur + * in the input, map, or reduce jobs without deadlocking. + * + * The result of this callable is the final reduce value for the input / map / reduce jobs + */ + private class MasterJob implements Callable { + final Iterator inputReader; + final NSMapFunction map; + final ReduceType initialValue; + final NSReduceFunction reduce; + + private MasterJob(Iterator inputReader, NSMapFunction map, ReduceType initialValue, NSReduceFunction reduce) { + this.inputReader = inputReader; + this.map = map; + this.initialValue = initialValue; + this.reduce = reduce; + } + + @Override + public ReduceType call() { + // a blocking queue that limits the number of input datum to the requested buffer size + // note we need +1 because we continue to enqueue the lastObject + final BlockingQueue.InputValue> inputQueue + = new LinkedBlockingDeque.InputValue>(bufferSize+1); + + // Create the input producer and start it running + final InputProducer inputProducer = + new InputProducer(inputReader, errorTracker, myNSRuntimeProfile.inputTimer, inputQueue); + inputExecutor.submit(inputProducer); + + // a priority queue that stores up to bufferSize elements + // produced by completed map jobs. + final PriorityBlockingQueue> mapResultQueue = + new PriorityBlockingQueue>(); + + final Reducer reducer + = new Reducer(reduce, errorTracker, myNSRuntimeProfile.reduceTimer, initialValue); + + try { + int nSubmittedJobs = 0; + + while ( continueToSubmitJobs(nSubmittedJobs, inputProducer) ) { + // acquire a slot to run a map job. 
Blocks if too many jobs are enqueued + runningMapJobSlots.acquire(); + + mapExecutor.submit(new MapReduceJob(inputQueue, mapResultQueue, map, reducer)); + nSubmittedJobs++; + } + + // mark the last job id we've submitted so we now the id to wait for + //logger.warn("setting jobs submitted to " + nSubmittedJobs); + reducer.setTotalJobCount(nSubmittedJobs); + + // wait for all of the input and map threads to finish + return waitForCompletion(inputProducer, reducer); + } catch (Exception ex) { + errorTracker.notifyOfError(ex); + return initialValue; + } + } + + /** + * Wait until the input thread and all map threads have completed running, and return the final reduce result + */ + private ReduceType waitForCompletion(final InputProducer inputProducer, + final Reducer reducer) throws InterruptedException { + // wait until we have a final reduce result +// logger.warn("waiting for final reduce"); + final ReduceType finalSum = reducer.waitForFinalReduce(); + + // now wait for the input provider thread to terminate +// logger.warn("waiting on inputProducer"); + inputProducer.waitForDone(); + + // wait for all the map threads to finish by acquiring and then releasing all map job semaphores +// logger.warn("waiting on map"); + runningMapJobSlots.acquire(bufferSize); + runningMapJobSlots.release(bufferSize); + + // everything is finally shutdown, return the final reduce value + return finalSum; + } + + /** + * Should we continue to submit jobs given the number of jobs already submitted and the + * number of read items in inputProducer? + * + * We continue to submit jobs while inputProducer hasn't reached EOF or the number + * of jobs we've enqueued isn't the number of read elements. This means that in + * some cases we submit more jobs than total read elements (cannot know because of + * multi-threading) so map jobs must handle the case where getNext() returns EOF. 
+ * + * @param nJobsSubmitted + * @param inputProducer + * @return + */ + private boolean continueToSubmitJobs(final int nJobsSubmitted, final InputProducer inputProducer) { + final int nReadItems = inputProducer.getNumInputValues(); + return nReadItems == -1 || nJobsSubmitted < nReadItems; + } + } + + private class MapReduceJob implements Runnable { + final BlockingQueue.InputValue> inputQueue; + final PriorityBlockingQueue> mapResultQueue; + final NSMapFunction map; + final Reducer reducer; + + private MapReduceJob(BlockingQueue.InputValue> inputQueue, + final PriorityBlockingQueue> mapResultQueue, + final NSMapFunction map, + final Reducer reducer) { + this.inputQueue = inputQueue; + this.mapResultQueue = mapResultQueue; + this.map = map; + this.reducer = reducer; + } + + @Override + public void run() { + try { + //debugPrint("Running MapReduceJob " + jobID); + final InputProducer.InputValue inputWrapper = inputQueue.take(); + final int jobID = inputWrapper.getId(); + + final MapResult result; + if ( ! inputWrapper.isEOFMarker() ) { + // just skip doing anything if we don't have work to do, which is possible + // because we don't necessarily know how much input there is when we queue + // up our jobs + final InputType input = inputWrapper.getValue(); + + // map + myNSRuntimeProfile.mapTimer.restart(); + final long preMapTime = LOG_MAP_TIMES ? 
0 : myNSRuntimeProfile.mapTimer.currentTimeNano(); + final MapType mapValue = map.apply(input); + if ( LOG_MAP_TIMES ) logger.info("MAP TIME " + (myNSRuntimeProfile.mapTimer.currentTimeNano() - preMapTime)); + myNSRuntimeProfile.mapTimer.stop(); + + // enqueue the result into the mapResultQueue + result = new MapResult(mapValue, jobID); + + if ( jobID % bufferSize == 0 && progressFunction != null ) + progressFunction.progress(input); + } else { + // push back the EOF marker so other waiting threads can read it + inputQueue.put(inputWrapper.nextEOF()); + + // if there's no input we push empty MapResults with jobIDs for synchronization with Reducer + result = new MapResult(jobID); + } + + mapResultQueue.put(result); + + final int nReduced = reducer.reduceAsMuchAsPossible(mapResultQueue); + } catch (Exception ex) { + errorTracker.notifyOfError(ex); + } finally { + // we finished a map job, release the job queue semaphore + runningMapJobSlots.release(); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java new file mode 100644 index 000000000..92c1018eb --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/nanoScheduler/Reducer.java @@ -0,0 +1,209 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; +import org.broadinstitute.sting.utils.SimpleTimer; + +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.PriorityBlockingQueue; + +/** + * Reducer supporting two-threaded reduce of the map/reduce. + * + * The first thread, using the reduceAsMuchAsPossible function, actually reduces the data + * as it arrives in the blockingQueue. 
+ * + * The second thread, using the waitForFinalReduce, can block on this data structure + * until that all jobs have arrived and been reduced. + * + * The key function for communication here is setTotalJobCount(), which the thread that submits + * jobs that enqueue MapResults into the blocking queue must call ONCE to tell the + * Reducer the total number of jobs that have been submitted for map. When numOfSubmittedJobs + * have been processed, this class frees a latch that allows thread blocked on waitForFinalReduce to proceed. + * + * This thread reads from mapResultsQueue until the poison EOF object arrives. At each + * stage is calls reduce(value, sum). The blocking mapResultQueue ensures that the + * queue waits until the mapResultQueue has a value to take. Then, it gets and waits + * until the map result Future has a value. + */ +class Reducer { + private final static Logger logger = Logger.getLogger(Reducer.class); + private final static int UNSET_NUM_SUBMITTED_JOBS = -2; + + final CountDownLatch countDownLatch = new CountDownLatch(1); + final NSReduceFunction reduce; + final SimpleTimer reduceTimer; + final MultiThreadedErrorTracker errorTracker; + + /** + * The sum of the reduce function applied to all MapResults. After this Reducer + * is done sum contains the final reduce result. 
+ */ + ReduceType sum; + + int numSubmittedJobs = UNSET_NUM_SUBMITTED_JOBS; // not yet set + + /** + * The jobID of the last job we've seen + */ + int prevJobID = -1; // no jobs observed + + /** + * A counter keeping track of the number of jobs we're reduced + */ + int numJobsReduced = 0; + + /** + * Create a new Reducer that will apply the reduce function with initialSum value + * to values via reduceAsMuchAsPossible, timing the reduce function call costs with + * reduceTimer + * + * @param reduce the reduce function to apply + * @param reduceTimer the timer to time the reduce function call + * @param initialSum the initial reduce sum + */ + public Reducer(final NSReduceFunction reduce, + final MultiThreadedErrorTracker errorTracker, + final SimpleTimer reduceTimer, + final ReduceType initialSum) { + if ( errorTracker == null ) throw new IllegalArgumentException("Error tracker cannot be null"); + if ( reduce == null ) throw new IllegalArgumentException("Reduce function cannot be null"); + if ( reduceTimer == null ) throw new IllegalArgumentException("reduceTimer cannot be null"); + + this.errorTracker = errorTracker; + this.reduce = reduce; + this.reduceTimer = reduceTimer; + this.sum = initialSum; + } + + /** + * Should we reduce the next value in the mapResultQueue? 
+ * + * @param mapResultQueue the queue of map results + * @return true if we should reduce + */ + @Requires("mapResultQueue != null") + private synchronized boolean reduceNextValueInQueue(final PriorityBlockingQueue> mapResultQueue) { + final MapResult nextMapResult = mapResultQueue.peek(); + if ( nextMapResult == null ) { + return false; + } else if ( nextMapResult.getJobID() < prevJobID + 1 ) { + throw new IllegalStateException("Next job ID " + nextMapResult.getJobID() + " is < previous job id " + prevJobID); + } else if ( nextMapResult.getJobID() == prevJobID + 1 ) { + return true; + } else { + return false; + } + } + + /** + * Reduce as much data as possible in mapResultQueue, returning the number of reduce calls completed + * + * As much as possible is defined as all of the MapResults in the queue are in order starting from the + * numSubmittedJobs we reduced previously, up to the either the queue being empty or where the next MapResult + * doesn't have JobID == prevJobID + 1. + * + * @param mapResultQueue a queue of MapResults in jobID order + * @return the number of reduces run, from 0 > + * @throws InterruptedException + */ + @Ensures("result >= 0") + public synchronized int reduceAsMuchAsPossible(final PriorityBlockingQueue> mapResultQueue) { + if ( mapResultQueue == null ) throw new IllegalArgumentException("mapResultQueue cannot be null"); + int nReducesNow = 0; + +// if ( numSubmittedJobs != UNSET_NUM_SUBMITTED_JOBS ) +// logger.warn(" maybeReleaseLatch " + numJobsReduced + " numSubmittedJobs " + numSubmittedJobs + " queue " + mapResultQueue.size()); + try { + while ( reduceNextValueInQueue(mapResultQueue) ) { + final MapResult result = mapResultQueue.take(); + prevJobID = result.getJobID(); + + if ( ! 
result.isEOFMarker() ) { + nReducesNow++; + + // apply reduce, keeping track of sum + reduceTimer.restart(); + sum = reduce.apply(result.getValue(), sum); + reduceTimer.stop(); + + } + + numJobsReduced++; + maybeReleaseLatch(); + } + } catch (Exception ex) { + errorTracker.notifyOfError(ex); + countDownLatch.countDown(); + } +// if ( numSubmittedJobs == UNSET_NUM_SUBMITTED_JOBS ) +// logger.warn(" maybeReleaseLatch " + numJobsReduced + " numSubmittedJobs " + numSubmittedJobs + " queue " + mapResultQueue.size()); + + return nReducesNow; + } + + /** + * release the latch if appropriate + * + * Appropriate means we've seen the last job, or there's only a single job id + */ + private synchronized void maybeReleaseLatch() { + if ( numJobsReduced == numSubmittedJobs ) { + // either we've already seen the last one prevJobID == numSubmittedJobs or + // the last job ID is -1, meaning that no jobs were ever submitted + countDownLatch.countDown(); + } + } + + /** + * For testing only + * + * @return true if latch is released + */ + protected synchronized boolean latchIsReleased() { + return countDownLatch.getCount() == 0; + } + + /** + * Key function: tell this class the total number of jobs will provide data in the mapResultsQueue + * + * The total job count when we free threads blocked on waitForFinalReduce. When we see numOfSubmittedJobs + * MapResults from the queue, those threads are released. + * + * Until this function is called, those thread will block forever. The numOfSubmittedJobs has a few constraints. + * First, it must be >= 0. 0 indicates that in fact no jobs will ever be submitted (i.e., there's no + * data coming) so the latch should be opened immediately. If it's >= 1, we will wait until + * we see numOfSubmittedJobs jobs before freeing them. + * + * Note that we throw an IllegalStateException if this function is called twice. 
+ * + * @param numOfSubmittedJobs int >= 0 indicating the total number of MapResults that will + * enqueue results into our queue + */ + public synchronized void setTotalJobCount(final int numOfSubmittedJobs) { + if ( numOfSubmittedJobs < 0 ) + throw new IllegalArgumentException("numOfSubmittedJobs must be >= 0, but saw " + numOfSubmittedJobs); + if ( this.numSubmittedJobs != UNSET_NUM_SUBMITTED_JOBS) + throw new IllegalStateException("setlastJobID called multiple times, but should only be called once"); + + //logger.warn("setTotalJobCount " + numJobsReduced + " numSubmitted " + numOfSubmittedJobs); + this.numSubmittedJobs = numOfSubmittedJobs; + maybeReleaseLatch(); + } + + /** + * Block until the last job has submitted its MapResult to our queue, and we've reduced it, and + * return the reduce result resulting from applying reduce(...) to all MapResult elements. + * + * @return the total reduce result across all jobs + * @throws InterruptedException + */ + public ReduceType waitForFinalReduce() throws InterruptedException { + //logger.warn("waitForFinalReduce() " + numJobsReduced + " " + numSubmittedJobs); + countDownLatch.await(); + //logger.warn(" done waitForFinalReduce"); + return sum; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java new file mode 100644 index 000000000..17089ee81 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/ExactPairHMM.java @@ -0,0 +1,107 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +import java.util.ArrayList; +import java.util.Arrays; + +/** + * Created with IntelliJ IDEA. 
+ * User: rpoplin + * Date: 10/16/12 + */ + +public class ExactPairHMM extends PairHMM { + + @Override + public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2; + final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2; + + matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; + + for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) { + Arrays.fill(matchMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(XMetricArray[iii], Double.NEGATIVE_INFINITY); + Arrays.fill(YMetricArray[iii], Double.NEGATIVE_INFINITY); + } + + // the initial condition + matchMetricArray[1][1] = 0.0; // Math.log10(1.0); + } + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? 
MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.log10sumLog10(new double[]{matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]}); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0}); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1}); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.log10sumLog10(new double[]{matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2}); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java new file mode 100644 index 000000000..cd946cdf1 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/OriginalPairHMM.java @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.pairhmm; + +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; + +/** + * Util class for performing the pair HMM for local alignment. Figure 4.3 in Durbin 1998 book. + * User: rpoplin + * Date: 3/1/12 + */ + +public class OriginalPairHMM extends ExactPairHMM { + + @Override + public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ) { + + // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment + final int X_METRIC_LENGTH = readBases.length + 2; + final int Y_METRIC_LENGTH = haplotypeBases.length + 2; + + // ensure that all the qual scores have valid values + for( int iii = 0; iii < readQuals.length; iii++ ) { + readQuals[iii] = ( readQuals[iii] < QualityUtils.MIN_USABLE_Q_SCORE ? QualityUtils.MIN_USABLE_Q_SCORE : (readQuals[iii] > MAX_CACHED_QUAL ? 
MAX_CACHED_QUAL : readQuals[iii]) ); + } + + // simple rectangular version of update loop, slow + for( int iii = 1; iii < X_METRIC_LENGTH; iii++ ) { + for( int jjj = hapStartIndex + 1; jjj < Y_METRIC_LENGTH; jjj++ ) { + if( (iii == 1 && jjj == 1) ) { continue; } + updateCell(iii, jjj, haplotypeBases, readBases, readQuals, insertionGOP, deletionGOP, overallGCP, + matchMetricArray, XMetricArray, YMetricArray); + } + } + + // final probability is the log10 sum of the last element in all three state arrays + final int endI = X_METRIC_LENGTH - 1; + final int endJ = Y_METRIC_LENGTH - 1; + return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]); + } + + private void updateCell( final int indI, final int indJ, final byte[] haplotypeBases, final byte[] readBases, + final byte[] readQuals, final byte[] insertionGOP, final byte[] deletionGOP, final byte[] overallGCP, + final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) { + + // the read and haplotype indices are offset by one because the state arrays have an extra column to hold the initial conditions + final int im1 = indI - 1; + final int jm1 = indJ - 1; + + // update the match array + double pBaseReadLog10 = 0.0; // Math.log10(1.0); + if( im1 > 0 && jm1 > 0 ) { // the emission probability is applied when leaving the state + final byte x = readBases[im1-1]; + final byte y = haplotypeBases[jm1-1]; + final byte qual = readQuals[im1-1]; + pBaseReadLog10 = ( x == y || x == (byte) 'N' || y == (byte) 'N' ? QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) ); + } + final int qualIndexGOP = ( im1 == 0 ? DEFAULT_GOP + DEFAULT_GOP : ( insertionGOP[im1-1] + deletionGOP[im1-1] > MAX_CACHED_QUAL ? MAX_CACHED_QUAL : insertionGOP[im1-1] + deletionGOP[im1-1]) ); + final double d0 = QualityUtils.qualToProbLog10((byte)qualIndexGOP); + final double e0 = ( im1 == 0 ? 
QualityUtils.qualToProbLog10(DEFAULT_GCP) : QualityUtils.qualToProbLog10(overallGCP[im1-1]) ); + matchMetricArray[indI][indJ] = pBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ-1] + d0, XMetricArray[indI-1][indJ-1] + e0, YMetricArray[indI-1][indJ-1] + e0); + + // update the X (insertion) array + final double d1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GOP) : QualityUtils.qualToErrorProbLog10(insertionGOP[im1-1]) ); + final double e1 = ( im1 == 0 ? QualityUtils.qualToErrorProbLog10(DEFAULT_GCP) : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseReadLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + XMetricArray[indI][indJ] = qBaseReadLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI-1][indJ] + d1, XMetricArray[indI-1][indJ] + e1); + + // update the Y (deletion) array, with penalty of zero on the left and right flanks to allow for a local alignment within the haplotype + final double d2 = ( im1 == 0 || im1 == readBases.length ? 0.0 : QualityUtils.qualToErrorProbLog10(deletionGOP[im1-1]) ); + final double e2 = ( im1 == 0 || im1 == readBases.length ? 
0.0 : QualityUtils.qualToErrorProbLog10(overallGCP[im1-1]) ); + final double qBaseRefLog10 = 0.0; // Math.log10(1.0) -- we don't have an estimate for this emission probability so assume q=1.0 + YMetricArray[indI][indJ] = qBaseRefLog10 + MathUtils.approximateLog10SumLog10(matchMetricArray[indI][indJ-1] + d2, YMetricArray[indI][indJ-1] + e2); + } +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java new file mode 100644 index 000000000..7a1399c32 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/pairhmm/PairHMM.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.utils.pairhmm; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +/** + * Created with IntelliJ IDEA. + * User: rpoplin + * Date: 10/16/12 + */ + +public abstract class PairHMM { + protected static final Byte MAX_CACHED_QUAL = Byte.MAX_VALUE; + protected static final byte DEFAULT_GOP = (byte) 45; + protected static final byte DEFAULT_GCP = (byte) 10; + + public enum HMM_IMPLEMENTATION { + /* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */ + EXACT, + /* PairHMM as implemented for the UnifiedGenotyper. 
Uses log10 sum functions accurate to only 1E-4 */ + ORIGINAL, + /* Optimized version of the PairHMM which caches per-read computations */ + CACHING, + /* Optimized version of the PairHMM which caches per-read computations and operations in real space to avoid costly sums of log10'ed likelihoods */ + LOGLESS_CACHING + } + + protected double[][] matchMetricArray = null; + protected double[][] XMetricArray = null; + protected double[][] YMetricArray = null; + + public abstract void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ); + + @Requires({"readBases.length == readQuals.length", "readBases.length == insertionGOP.length", "readBases.length == deletionGOP.length", + "readBases.length == overallGCP.length", "matchMetricArray!=null", "XMetricArray!=null", "YMetricArray!=null"}) + @Ensures({"!Double.isInfinite(result)", "!Double.isNaN(result)"}) // Result should be a proper log10 likelihood + public abstract double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases, + final byte[] readBases, + final byte[] readQuals, + final byte[] insertionGOP, + final byte[] deletionGOP, + final byte[] overallGCP, + final int hapStartIndex, + final boolean recacheReadValues ); +} diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 3d986f666..ed6fc46bb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -613,6 +613,8 @@ public abstract class AbstractReadBackedPileup= 0", + "progressPrintFrequency > 0" +}) +public class ProgressMeter { + protected static final Logger logger = Logger.getLogger(ProgressMeter.class); + + // -------------------------------------------------------------------------------- + // static constants controlling overall system behavior + // 
-------------------------------------------------------------------------------- + + /** + * Min. milliseconds after we start up the meter before we will print our first meter message + */ + private final static long MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS = 30 * 1000; + + /** + * How often should we print performance logging information, when we are sending this + * information to a file? Not dynamically updated as the logger meter is. + */ + private final static long PERFORMANCE_LOG_PRINT_FREQUENCY = 10 * 1000; + + private final static double TWO_HOURS_IN_SECONDS = 2.0 * 60.0 * 60.0; + private final static double TWELVE_HOURS_IN_SECONDS = 12.0 * 60.0 * 60.0; + + // -------------------------------------------------------------------------------- + // Variables we updating during running + // -------------------------------------------------------------------------------- + + /** + * When was the last time we printed progress log? In milleseconds + */ + private long lastProgressPrintTime = -1; + + /** + * How frequently should we be printing our meter messages? Dynamically updated + * depending on how long we think the run has left. + */ + private long progressPrintFrequency = 10 * 1000; // default value + + /** + * When was the last time we printed to the performance log? In millseconds + */ + private long lastPerformanceLogPrintTime = -1; + + // -------------------------------------------------------------------------------- + // final variables fixed at object creation time + // -------------------------------------------------------------------------------- + + /** + * The set of genome locs describing the total region we are processing with + * this GATK run. Used to determine how close we are to completing the run + */ + private final GenomeLocSortedSet regionsBeingProcessed; + + /** + * Size, in bp, of the area we are processing, derived from regionsBeingProcessed. 
+ * Updated once in the system in initial for performance reasons + */ + private final long targetSizeInBP; + + /** + * A string describing the type of units being processes, so we can say things like + * "we are running at X processingUnitName per second" + */ + private final String processingUnitName; + + /** + * A potentially null file where we print a supplementary, R readable performance log + * file. + */ + private final PrintStream performanceLog; + + /** We use the SimpleTimer to time our run */ + private final SimpleTimer timer = new SimpleTimer(); + + /** + * Create a new ProgressMeter + * + * @param performanceLogFile an optional performance log file where a table of performance logs will be written + * @param processingUnitName the name of the unit type being processed, suitable for saying X seconds per processingUnitName + * @param processingIntervals the intervals being processed + */ + public ProgressMeter(final File performanceLogFile, + final String processingUnitName, + final GenomeLocSortedSet processingIntervals) { + if ( processingUnitName == null ) throw new IllegalArgumentException("processingUnitName cannot be null"); + if ( processingIntervals == null ) throw new IllegalArgumentException("Target intervals cannot be null"); + + this.processingUnitName = processingUnitName; + this.regionsBeingProcessed = processingIntervals; + + // setup the performance logger output, if requested + if ( performanceLogFile != null ) { + try { + this.performanceLog = new PrintStream(new FileOutputStream(performanceLogFile)); + final List pLogHeader = Arrays.asList("elapsed.time", "units.processed", "processing.speed", + "bp.processed", "bp.speed", "genome.fraction.complete", "est.total.runtime", "est.time.remaining"); + performanceLog.println(Utils.join("\t", pLogHeader)); + } catch (FileNotFoundException e) { + throw new UserException.CouldNotCreateOutputFile(performanceLogFile, e); + } + } else { + performanceLog = null; + } + + // cached for performance 
reasons + targetSizeInBP = processingIntervals.coveredSize(); + + // start up the timer + start(); + } + + /** + * Forward request to notifyOfProgress + * + * Assumes that one cycle has been completed + * + * @param loc our current location. Null means "in unmapped reads" + * @param nTotalRecordsProcessed the total number of records we've processed + */ + public void notifyOfProgress(final GenomeLoc loc, final long nTotalRecordsProcessed) { + notifyOfProgress(loc, false, nTotalRecordsProcessed); + } + + private synchronized void start() { + timer.start(); + lastProgressPrintTime = timer.currentTime(); + + logger.info("[INITIALIZATION COMPLETE; STARTING PROCESSING]"); + logger.info(String.format("%15s processed.%s runtime per.1M.%s completed total.runtime remaining", + "Location", processingUnitName, processingUnitName)); + } + + /** + * @return the current runtime in nanoseconds + */ + @Ensures("result >= 0") + public long getRuntimeInNanoseconds() { + return timer.getElapsedTimeNano(); + } + + /** + * Utility routine that prints out process information (including timing) every N records or + * every M seconds, for N and M set in global variables. + * + * Synchronized to ensure that even with multiple threads calling notifyOfProgress we still + * get one clean stream of meter logs. 
+ * + * @param loc Current location, can be null if you are at the end of the processing unit + * @param mustPrint If true, will print out info, regardless of time interval + * @param nTotalRecordsProcessed the total number of records we've processed + */ + private synchronized void notifyOfProgress(final GenomeLoc loc, boolean mustPrint, final long nTotalRecordsProcessed) { + if ( nTotalRecordsProcessed < 0 ) throw new IllegalArgumentException("nTotalRecordsProcessed must be >= 0"); + + final long curTime = timer.currentTime(); + final boolean printProgress = mustPrint || maxElapsedIntervalForPrinting(curTime, lastProgressPrintTime, progressPrintFrequency); + final boolean printLog = performanceLog != null && maxElapsedIntervalForPrinting(curTime, lastPerformanceLogPrintTime, PERFORMANCE_LOG_PRINT_FREQUENCY); + + if ( printProgress || printLog ) { + final ProgressMeterData progressData = takeProgressSnapshot(loc, nTotalRecordsProcessed); + + final AutoFormattingTime elapsed = new AutoFormattingTime(progressData.getElapsedSeconds()); + final AutoFormattingTime bpRate = new AutoFormattingTime(progressData.secondsPerMillionBP()); + final AutoFormattingTime unitRate = new AutoFormattingTime(progressData.secondsPerMillionElements()); + final double fractionGenomeTargetCompleted = progressData.calculateFractionGenomeTargetCompleted(targetSizeInBP); + final AutoFormattingTime estTotalRuntime = new AutoFormattingTime(elapsed.getTimeInSeconds() / fractionGenomeTargetCompleted); + final AutoFormattingTime timeToCompletion = new AutoFormattingTime(estTotalRuntime.getTimeInSeconds() - elapsed.getTimeInSeconds()); + + if ( printProgress ) { + lastProgressPrintTime = curTime; + updateLoggerPrintFrequency(estTotalRuntime.getTimeInSeconds()); + + // a pretty name for our position + final String posName = loc == null + ? (mustPrint ? 
"done" : "unmapped reads") + : String.format("%s:%d", loc.getContig(), loc.getStart()); + + logger.info(String.format("%15s %5.2e %s %s %5.1f%% %s %s", + posName, progressData.getUnitsProcessed()*1.0, elapsed, unitRate, + 100*fractionGenomeTargetCompleted, estTotalRuntime, timeToCompletion)); + + } + + if ( printLog ) { + lastPerformanceLogPrintTime = curTime; + performanceLog.printf("%.2f\t%d\t%.2e\t%d\t%.2e\t%.2e\t%.2f\t%.2f%n", + elapsed.getTimeInSeconds(), progressData.getUnitsProcessed(), unitRate.getTimeInSeconds(), + progressData.getBpProcessed(), bpRate.getTimeInSeconds(), + fractionGenomeTargetCompleted, estTotalRuntime.getTimeInSeconds(), + timeToCompletion.getTimeInSeconds()); + } + } + } + + /** + * Determine, based on remaining runtime, how often to print the meter + * + * @param totalRuntimeSeconds kinda obvious, no? + */ + private void updateLoggerPrintFrequency(final double totalRuntimeSeconds) { + // dynamically change the update rate so that short running jobs receive frequent updates while longer jobs receive fewer updates + if ( totalRuntimeSeconds > TWELVE_HOURS_IN_SECONDS ) + progressPrintFrequency = 60 * 1000; // in milliseconds + else if ( totalRuntimeSeconds > TWO_HOURS_IN_SECONDS ) + progressPrintFrequency = 30 * 1000; // in milliseconds + else + progressPrintFrequency = 10 * 1000; // in milliseconds + } + + /** + * Creates a new ProgressData object recording a snapshot of our progress at this instant + * + * @param loc our current position. If null, assumes we are done traversing + * @param nTotalRecordsProcessed the total number of records we've processed + * @return + */ + private ProgressMeterData takeProgressSnapshot(final GenomeLoc loc, final long nTotalRecordsProcessed) { + // null -> end of processing + final long bpProcessed = loc == null ? 
targetSizeInBP : regionsBeingProcessed.sizeBeforeLoc(loc); + return new ProgressMeterData(timer.getElapsedTime(), nTotalRecordsProcessed, bpProcessed); + } + + /** + * Should be called when processing is done + */ + public void notifyDone(final long nTotalRecordsProcessed) { + // print out the progress meter + notifyOfProgress(null, true, nTotalRecordsProcessed); + + logger.info(String.format("Total runtime %.2f secs, %.2f min, %.2f hours", + timer.getElapsedTime(), timer.getElapsedTime() / 60, timer.getElapsedTime() / 3600)); + + if ( performanceLog != null ) + performanceLog.close(); + } + + /** + * @param curTime (current runtime, in millisecs) + * @param lastPrintTime the last time we printed, in machine milliseconds + * @param printFreq maximum permitted difference between last print and current times + * + * @return true if the maximum interval (in millisecs) has passed since the last printing + */ + private boolean maxElapsedIntervalForPrinting(final long curTime, long lastPrintTime, long printFreq) { + final long elapsed = curTime - lastPrintTime; + return elapsed > printFreq && elapsed > MIN_ELAPSED_TIME_BEFORE_FIRST_PROGRESS; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java new file mode 100644 index 000000000..096b55be2 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterData.java @@ -0,0 +1,54 @@ +package org.broadinstitute.sting.utils.progressmeter; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; + +/** + * a snapshot of our performance, suitable for storage and later analysis + */ +class ProgressMeterData { + private final double elapsedSeconds; + private final long unitsProcessed; + private final long bpProcessed; + + @Requires({"unitsProcessed >= 0", "bpProcessed >= 0", "elapsedSeconds >= 0"}) + public ProgressMeterData(double 
elapsedSeconds, long unitsProcessed, long bpProcessed) { + this.elapsedSeconds = elapsedSeconds; + this.unitsProcessed = unitsProcessed; + this.bpProcessed = bpProcessed; + } + + @Ensures("result >= 0.0") + public double getElapsedSeconds() { + return elapsedSeconds; + } + + @Ensures("result >= 0") + public long getUnitsProcessed() { + return unitsProcessed; + } + + @Ensures("result >= 0") + public long getBpProcessed() { + return bpProcessed; + } + + /** How long in seconds to process 1M traversal units? */ + @Ensures("result >= 0.0") + public double secondsPerMillionElements() { + return (elapsedSeconds * 1000000.0) / Math.max(unitsProcessed, 1); + } + + /** How long in seconds to process 1M bp on the genome? */ + @Ensures("result >= 0.0") + public double secondsPerMillionBP() { + return (elapsedSeconds * 1000000.0) / Math.max(bpProcessed, 1); + } + + /** What fraction of the target intervals have we covered? */ + @Requires("targetSize >= 0") + @Ensures({"result >= 0.0", "result <= 1.0"}) + public double calculateFractionGenomeTargetCompleted(final long targetSize) { + return (1.0*bpProcessed) / Math.max(targetSize, 1); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java new file mode 100644 index 000000000..431014032 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRMode.java @@ -0,0 +1,30 @@ +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; + +import java.lang.annotation.*; + +/** + * User: hanna + * Date: May 14, 2009 + * Time: 1:51:22 PM + * BROAD INSTITUTE SOFTWARE COPYRIGHT NOTICE AND AGREEMENT + * Software and documentation are copyright 2005 by the Broad Institute. + * All rights are reserved. + * + * Users acknowledge that this software is supplied without any warranty or support. 
+ * The Broad Institute is not responsible for its use, misuse, or + * functionality. + */ + +/** + * Allows the walker to indicate what type of data it wants to consume. + */ + +@Documented +@Inherited +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface BQSRMode { + public abstract ReadTransformer.ApplicationTime ApplicationTime() default ReadTransformer.ApplicationTime.ON_INPUT; +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java new file mode 100644 index 000000000..fae0e8c09 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRReadTransformer.java @@ -0,0 +1,40 @@ +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +/** + * A ReadTransformer that applies BQSR on the fly to reads + * + * User: rpoplin + * Date: 2/13/12 + */ +public class BQSRReadTransformer extends ReadTransformer { + private boolean enabled; + private BaseRecalibration bqsr; + + @Override + public ApplicationTime initializeSub(final GenomeAnalysisEngine engine, final Walker walker) { + this.enabled = engine.hasBaseRecalibration(); + this.bqsr = engine.getBaseRecalibration(); + final BQSRMode mode = WalkerManager.getWalkerAnnotation(walker, BQSRMode.class); + return mode.ApplicationTime(); + } + + @Override + public boolean enabled() { + return enabled; + } + + /** + * initialize a new BQSRReadTransformer that applies BQSR on the fly to incoming reads. 
+ */ + @Override + public GATKSAMRecord apply(GATKSAMRecord read) { + bqsr.recalibrateRead(read); + return read; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java deleted file mode 100644 index 048f8e58c..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BQSRSamIterator.java +++ /dev/null @@ -1,50 +0,0 @@ -package org.broadinstitute.sting.utils.recalibration; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Requires; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.Iterator; - -/** - * Created by IntelliJ IDEA. - * User: rpoplin - * Date: 2/13/12 - */ - -public class BQSRSamIterator implements StingSAMIterator { - private final StingSAMIterator it; - private final BaseRecalibration bqsr; - - /** - * Creates a new BQSRSamIterator and applies BQSR on the fly to incoming reads. 
- * - * @param it The incoming SamIterator to wrap - * @param bqsr The object which holds the BQSR table information and knows how to apply it - */ - @Requires({ - "it != null", - "bqsr != null"}) - public BQSRSamIterator(StingSAMIterator it, BaseRecalibration bqsr) { - if ( bqsr == null ) throw new ReviewedStingException("BUG: shouldn't create BQSRSamIterator with null recalibration object"); - - this.it = it; - this.bqsr = bqsr; - } - - @Requires("hasNext()") - @Ensures("result != null") - public SAMRecord next() { - SAMRecord read = it.next(); - bqsr.recalibrateRead((GATKSAMRecord) read); - return read; - } - - public boolean hasNext() { return this.it.hasNext(); } - public void remove() { throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!"); } - public void close() { it.close(); } - public Iterator iterator() { return this; } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index c09eb0063..5d4020a07 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -27,12 +27,11 @@ package org.broadinstitute.sting.utils.recalibration; import net.sf.samtools.SAMTag; import net.sf.samtools.SAMUtils; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.collections.NestedHashMap; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; @@ -46,21 +45,22 @@ import java.io.File; 
public class BaseRecalibration { private final static int MAXIMUM_RECALIBRATED_READ_LENGTH = 5000; - private final ReadCovariates readCovariates; - private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) + private final QuantizationInfo quantizationInfo; // histogram containing the map for qual quantization (calculated after recalibration is done) private final RecalibrationTables recalibrationTables; - private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation + private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation private final boolean disableIndelQuals; private final int preserveQLessThan; private final boolean emitOriginalQuals; - private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. - static { - for (int i = 0; i < EventType.values().length; i++) - qualityScoreByFullCovariateKey[i] = new NestedHashMap(); - } + // TODO -- was this supposed to be used somewhere? +// private static final NestedHashMap[] qualityScoreByFullCovariateKey = new NestedHashMap[EventType.values().length]; // Caches the result of performSequentialQualityCalculation(..) for all sets of covariate values. 
+// static { +// for (int i = 0; i < EventType.values().length; i++) +// qualityScoreByFullCovariateKey[i] = new NestedHashMap(); +// } + /** * Constructor using a GATK Report file @@ -76,12 +76,11 @@ public class BaseRecalibration { recalibrationTables = recalibrationReport.getRecalibrationTables(); requestedCovariates = recalibrationReport.getRequestedCovariates(); quantizationInfo = recalibrationReport.getQuantizationInfo(); - if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores + if (quantizationLevels == 0) // quantizationLevels == 0 means no quantization, preserve the quality scores quantizationInfo.noQuantization(); - else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report. + else if (quantizationLevels > 0 && quantizationLevels != quantizationInfo.getQuantizationLevels()) // any other positive value means, we want a different quantization than the one pre-calculated in the recalibration report. Negative values mean the user did not provide a quantization argument, and just wnats to use what's in the report. 
quantizationInfo.quantizeQualityScores(quantizationLevels); - readCovariates = new ReadCovariates(MAXIMUM_RECALIBRATED_READ_LENGTH, requestedCovariates.length); this.disableIndelQuals = disableIndelQuals; this.preserveQLessThan = preserveQLessThan; this.emitOriginalQuals = emitOriginalQuals; @@ -103,24 +102,25 @@ public class BaseRecalibration { } } - RecalUtils.computeCovariates(read, requestedCovariates, readCovariates); // compute all covariates for the read - for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings + final ReadCovariates readCovariates = RecalUtils.computeCovariates(read, requestedCovariates); + + for (final EventType errorModel : EventType.values()) { // recalibrate all three quality strings if (disableIndelQuals && errorModel != EventType.BASE_SUBSTITUTION) { read.setBaseQualities(null, errorModel); continue; } final byte[] quals = read.getBaseQualities(errorModel); - final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel); // get the keyset for this base using the error model + final int[][] fullReadKeySet = readCovariates.getKeySet(errorModel); // get the keyset for this base using the error model final int readLength = read.getReadLength(); - for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read + for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read final byte originalQualityScore = quals[offset]; - if (originalQualityScore >= preserveQLessThan) { // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) - final int[] keySet = fullReadKeySet[offset]; // get the keyset for this base using the error model - final byte recalibratedQualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base + if (originalQualityScore >= preserveQLessThan) { // only recalibrate usable qualities (the original quality will come from the instrument -- 
reported quality) + final int[] keySet = fullReadKeySet[offset]; // get the keyset for this base using the error model + final byte recalibratedQualityScore = performSequentialQualityCalculation(keySet, errorModel); // recalibrate the base quals[offset] = recalibratedQualityScore; } } @@ -128,6 +128,7 @@ public class BaseRecalibration { } } + /** * Implements a serial recalibration of the reads using the combinational table. * First, we perform a positional recalibration, and then a subsequent dinuc correction. @@ -145,17 +146,17 @@ public class BaseRecalibration { * @param errorModel the event type * @return A recalibrated quality score as a byte */ - protected byte performSequentialQualityCalculation(final int[] key, final EventType errorModel) { + private byte performSequentialQualityCalculation(final int[] key, final EventType errorModel) { final byte qualFromRead = (byte)(long)key[1]; - final double globalDeltaQ = calculateGlobalDeltaQ(recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE), key, errorModel); - final double deltaQReported = calculateDeltaQReported(recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE), key, errorModel, globalDeltaQ, qualFromRead); + final double globalDeltaQ = calculateGlobalDeltaQ(recalibrationTables.getReadGroupTable(), key, errorModel); + final double deltaQReported = calculateDeltaQReported(recalibrationTables.getQualityScoreTable(), key, errorModel, globalDeltaQ, qualFromRead); final double deltaQCovariates = calculateDeltaQCovariates(recalibrationTables, key, errorModel, globalDeltaQ, deltaQReported, qualFromRead); - double recalibratedQual = qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula - recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQual), QualityUtils.MAX_RECALIBRATED_Q_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL + double recalibratedQual = 
qualFromRead + globalDeltaQ + deltaQReported + deltaQCovariates; // calculate the recalibrated qual using the BQSR formula + recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQual), QualityUtils.MAX_RECALIBRATED_Q_SCORE); // recalibrated quality is bound between 1 and MAX_QUAL - return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality + return quantizationInfo.getQuantizedQuals().get((int) recalibratedQual); // return the quantized version of the recalibrated quality } private double calculateGlobalDeltaQ(final NestedIntegerArray table, final int[] key, final EventType errorModel) { diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java index 2b67d12a9..f3644fdd8 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java @@ -30,18 +30,18 @@ public class QuantizationInfo { } public QuantizationInfo(final RecalibrationTables recalibrationTables, final int quantizationLevels) { - final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution + final Long [] qualHistogram = new Long[QualityUtils.MAX_QUAL_SCORE+1]; // create a histogram with the empirical quality distribution for (int i = 0; i < qualHistogram.length; i++) qualHistogram[i] = 0L; - final NestedIntegerArray qualTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); // get the quality score table + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); // get the quality score table for (final RecalDatum value : qualTable.getAllValues()) { final RecalDatum datum = value; - final int empiricalQual = 
MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) - qualHistogram[empiricalQual] += datum.getNumObservations(); // add the number of observations for every key + final int empiricalQual = MathUtils.fastRound(datum.getEmpiricalQuality()); // convert the empirical quality to an integer ( it is already capped by MAX_QUAL ) + qualHistogram[empiricalQual] += (long) datum.getNumObservations(); // add the number of observations for every key } - empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities + empiricalQualCounts = Arrays.asList(qualHistogram); // histogram with the number of observations of the empirical qualities quantizeQualityScores(quantizationLevels); this.quantizationLevels = quantizationLevels; @@ -49,8 +49,8 @@ public class QuantizationInfo { public void quantizeQualityScores(int nLevels) { - QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels - quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC) + QualQuantizer quantizer = new QualQuantizer(empiricalQualCounts, nLevels, QualityUtils.MIN_USABLE_Q_SCORE); // quantize the qualities to the desired number of levels + quantizedQuals = quantizer.getOriginalToQuantizedMap(); // map with the original to quantized qual map (using the standard number of levels in the RAC) } public void noQuantization() { diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java index c86bd4deb..2b682f84b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java +++ 
b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java @@ -1,5 +1,7 @@ package org.broadinstitute.sting.utils.recalibration; +import java.util.Arrays; + /** * The object temporarily held by a read that describes all of it's covariates. * @@ -21,6 +23,17 @@ public class ReadCovariates { currentCovariateIndex = index; } + /** + * Necessary due to bug in BaseRecalibration recalibrateRead function. It is clearly seeing space it's not supposed to + * @return + */ + public ReadCovariates clear() { + for ( int i = 0; i < keys.length; i++ ) + for ( int j = 0; j < keys[i].length; j++) + Arrays.fill(keys[i][j], 0); + return this; + } + public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { keys[EventType.BASE_SUBSTITUTION.index][readOffset][currentCovariateIndex] = mismatch; keys[EventType.BASE_INSERTION.index][readOffset][currentCovariateIndex] = insertion; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 8c8815b54..e3348d3de 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -28,7 +28,6 @@ package org.broadinstitute.sting.utils.recalibration; import com.google.java.contract.Ensures; import com.google.java.contract.Invariant; import com.google.java.contract.Requires; -import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import java.util.Random; @@ -68,12 +67,12 @@ public class RecalDatum { /** * number of bases seen in total */ - private long numObservations; + private double numObservations; /** * number of bases seen that didn't match the reference */ - private long numMismatches; + private double numMismatches; /** * used when calculating empirical qualities to avoid division by zero @@ -93,7 
+92,7 @@ public class RecalDatum { * @param _numMismatches * @param reportedQuality */ - public RecalDatum(final long _numObservations, final long _numMismatches, final byte reportedQuality) { + public RecalDatum(final double _numObservations, final double _numMismatches, final byte reportedQuality) { if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); if ( _numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0"); @@ -167,9 +166,9 @@ public class RecalDatum { return 0.0; else { // cache the value so we don't call log over and over again - final double doubleMismatches = (double) (numMismatches + SMOOTHING_CONSTANT); + final double doubleMismatches = numMismatches + SMOOTHING_CONSTANT; // smoothing is one error and one non-error observation, for example - final double doubleObservations = (double) (numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT); + final double doubleObservations = numObservations + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; return doubleMismatches / doubleObservations; } } @@ -200,11 +199,11 @@ public class RecalDatum { @Override public String toString() { - return String.format("%d,%d,%d", getNumObservations(), getNumMismatches(), (byte) Math.floor(getEmpiricalQuality())); + return String.format("%.2f,%,2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); } public String stringForCSV() { - return String.format("%s,%d,%.2f", toString(), (byte) Math.floor(getEstimatedQReported()), getEmpiricalQuality() - getEstimatedQReported()); + return String.format("%s,%.2f,%.2f", toString(), getEstimatedQReported(), getEmpiricalQuality() - getEstimatedQReported()); } // /** @@ -229,42 +228,42 @@ public class RecalDatum { // //--------------------------------------------------------------------------------------------------------------- - public long getNumObservations() { + public double 
getNumObservations() { return numObservations; } - public synchronized void setNumObservations(final long numObservations) { + public synchronized void setNumObservations(final double numObservations) { if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); this.numObservations = numObservations; empiricalQuality = UNINITIALIZED; } - public long getNumMismatches() { + public double getNumMismatches() { return numMismatches; } @Requires({"numMismatches >= 0"}) - public synchronized void setNumMismatches(final long numMismatches) { + public synchronized void setNumMismatches(final double numMismatches) { if ( numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); this.numMismatches = numMismatches; empiricalQuality = UNINITIALIZED; } @Requires({"by >= 0"}) - public synchronized void incrementNumObservations(final long by) { + public synchronized void incrementNumObservations(final double by) { numObservations += by; empiricalQuality = UNINITIALIZED; } @Requires({"by >= 0"}) - public synchronized void incrementNumMismatches(final long by) { + public synchronized void incrementNumMismatches(final double by) { numMismatches += by; empiricalQuality = UNINITIALIZED; } @Requires({"incObservations >= 0", "incMismatches >= 0"}) @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"}) - public synchronized void increment(final long incObservations, final long incMismatches) { + public synchronized void increment(final double incObservations, final double incMismatches) { incrementNumObservations(incObservations); incrementNumMismatches(incMismatches); } @@ -300,6 +299,6 @@ public class RecalDatum { */ @Ensures("result >= 0.0") private double calcExpectedErrors() { - return (double) getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); + return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); } } diff --git 
a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java index 41e96222c..6c94c3c42 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java @@ -263,14 +263,14 @@ public class RecalDatumNode { int i = 0; for ( final RecalDatumNode subnode : subnodes ) { // use the yates correction to help avoid all zeros => NaN - counts[i][0] = subnode.getRecalDatum().getNumMismatches() + 1; - counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2; + counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L; + counts[i][1] = Math.round(subnode.getRecalDatum().getNumObservations()) + 2L; i++; } try { final double chi2PValue = new ChiSquareTestImpl().chiSquareTest(counts); - final double penalty = -10 * Math.log10(Math.max(chi2PValue, SMALLEST_CHI2_PVALUE)); + final double penalty = -10.0 * Math.log10(Math.max(chi2PValue, SMALLEST_CHI2_PVALUE)); // make sure things are reasonable and fail early if not if (Double.isInfinite(penalty) || Double.isNaN(penalty)) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 8a9143c89..7e90d98b9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -47,7 +47,7 @@ import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.ReadUtils; import java.io.File; -import java.io.FileNotFoundException; +import java.io.IOException; import java.io.PrintStream; import java.util.*; @@ -68,6 +68,7 @@ public class RecalUtils { public final static String QUALITY_SCORE_REPORT_TABLE_TITLE = "RecalTable1"; public final static String 
ALL_COVARIATES_REPORT_TABLE_TITLE = "RecalTable2"; + public final static String ARGUMENT_COLUMN_NAME = "Argument"; public final static String ARGUMENT_VALUE_COLUMN_NAME = "Value"; public final static String QUANTIZED_VALUE_COLUMN_NAME = "QuantizedScore"; public static final String QUANTIZED_COUNT_COLUMN_NAME = "Count"; @@ -81,8 +82,8 @@ public class RecalUtils { public final static String NUMBER_OBSERVATIONS_COLUMN_NAME = "Observations"; public final static String NUMBER_ERRORS_COLUMN_NAME = "Errors"; - private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams - private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color + private final static String COLOR_SPACE_ATTRIBUTE_TAG = "CS"; // The tag that holds the color space for SOLID bams + private final static String COLOR_SPACE_INCONSISTENCY_TAG = "ZC"; // A new tag made up for the recalibrator which will hold an array of ints which say if this base is inconsistent with its color private static boolean warnUserNullPlatform = false; private static final String SCRIPT_FILE = "BQSR.R"; @@ -92,8 +93,8 @@ public class RecalUtils { private static final Pair eventType = new Pair(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s"); private static final Pair empiricalQuality = new Pair(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); private static final Pair estimatedQReported = new Pair(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); - private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); - private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%d"); + private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%.2f"); + private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f"); /** * Generates 
two lists : required covariates and optional covariates based on the user's requests. @@ -111,12 +112,13 @@ public class RecalUtils { final List> requiredClasses = new PluginManager(RequiredCovariate.class).getPlugins(); final List> standardClasses = new PluginManager(StandardCovariate.class).getPlugins(); - final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates + final ArrayList requiredCovariates = addRequiredCovariatesToList(requiredClasses); // add the required covariates ArrayList optionalCovariates = new ArrayList(); if (!argumentCollection.DO_NOT_USE_STANDARD_COVARIATES) - optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user + optionalCovariates = addStandardCovariatesToList(standardClasses); // add the standard covariates if -standard was specified by the user - if (argumentCollection.COVARIATES != null) { // parse the -cov arguments that were provided, skipping over the ones already specified + // parse the -cov arguments that were provided, skipping over the ones already specified + if (argumentCollection.COVARIATES != null) { for (String requestedCovariateString : argumentCollection.COVARIATES) { // help the transition from BQSR v1 to BQSR v2 if ( requestedCovariateString.equals("DinucCovariate") ) @@ -126,12 +128,12 @@ public class RecalUtils { boolean foundClass = false; for (Class covClass : covariateClasses) { - if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class + if (requestedCovariateString.equalsIgnoreCase(covClass.getSimpleName())) { // -cov argument matches the class name for an implementing class foundClass = true; if (!requiredClasses.contains(covClass) && (argumentCollection.DO_NOT_USE_STANDARD_COVARIATES || !standardClasses.contains(covClass))) { try { - final Covariate covariate = covClass.newInstance(); // now 
that we've found a matching class, try to instantiate it + final Covariate covariate = covClass.newInstance(); // now that we've found a matching class, try to instantiate it optionalCovariates.add(covariate); } catch (Exception e) { throw new DynamicClassResolutionException(covClass, e); @@ -161,7 +163,7 @@ public class RecalUtils { if (classes.size() != 2) throw new ReviewedStingException("The number of required covariates has changed, this is a hard change in the code and needs to be inspected"); - dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. + dest.add(new ReadGroupCovariate()); // enforce the order with RG first and QS next. dest.add(new QualityScoreCovariate()); return dest; } @@ -266,20 +268,20 @@ public class RecalUtils { for (int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++) { - final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names - columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future + final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.index) { - columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future + columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index) { columnNames.add(covariateValue); columnNames.add(covariateName); } } - columnNames.add(eventType); // the order of these column names is important here + 
columnNames.add(eventType); // the order of these column names is important here columnNames.add(empiricalQuality); if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.index) - columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported + columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported columnNames.add(nObservations); columnNames.add(nErrors); @@ -288,7 +290,7 @@ public class RecalUtils { reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size()); for (final Pair columnName : columnNames) reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); - rowIndex = 0; // reset the row index since we're starting with a new table + rowIndex = 0; // reset the row index since we're starting with a new table } else { reportTable = result.get(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index); } @@ -316,7 +318,7 @@ public class RecalUtils { reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.index) - reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table + reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getNumObservations()); reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.getNumMismatches()); @@ -332,8 +334,8 @@ public class RecalUtils { return covariate.getClass().getSimpleName().split("Covariate")[0]; } - public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] 
requestedCovariates, final PrintStream outputFile) { - outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile); + public static void outputRecalibrationReport(final RecalibrationArgumentCollection RAC, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates) { + outputRecalibrationReport(RAC.generateReportTable(covariateNames(requestedCovariates)), quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), RAC.RECAL_TABLE); } /** @@ -349,7 +351,6 @@ public class RecalUtils { return Utils.join(",", names); } - public static void outputRecalibrationReport(final GATKReportTable argumentTable, final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final PrintStream outputFile) { outputRecalibrationReport(argumentTable, quantizationInfo.generateReportTable(), generateReportTables(recalibrationTables, requestedCovariates), outputFile); } @@ -362,46 +363,36 @@ public class RecalUtils { report.print(outputFile); } - private static Pair initializeRecalibrationPlot(File filename) { - final PrintStream deltaTableStream; - final File deltaTableFileName = new File(filename + ".csv"); - try { - deltaTableStream = new PrintStream(deltaTableFileName); - } catch (FileNotFoundException e) { - throw new UserException.CouldNotCreateOutputFile(deltaTableFileName, "File " + deltaTableFileName + " could not be created"); - } - return new Pair(deltaTableStream, deltaTableFileName); - } - - private static void outputRecalibrationPlot(final File gatkReportFilename, Pair files, boolean keepIntermediates) { - final File csvFileName = files.getSecond(); - final File plotFileName = new File(csvFileName + ".pdf"); - files.getFirst().close(); + private static void 
outputRecalibrationPlot(final RecalibrationArgumentCollection RAC) { final RScriptExecutor executor = new RScriptExecutor(); executor.addScript(new Resource(SCRIPT_FILE, RecalUtils.class)); - executor.addArgs(csvFileName.getAbsolutePath()); - executor.addArgs(gatkReportFilename.getAbsolutePath()); - executor.addArgs(plotFileName.getAbsolutePath()); + executor.addArgs(RAC.RECAL_CSV_FILE.getAbsolutePath()); + executor.addArgs(RAC.RECAL_TABLE_FILE.getAbsolutePath()); + executor.addArgs(RAC.RECAL_PDF_FILE.getAbsolutePath()); executor.exec(); - - if (!keepIntermediates) - if (!csvFileName.delete()) - throw new ReviewedStingException("Could not find file " + csvFileName.getAbsolutePath()); - } - public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final Covariate[] requestedCovariates, final boolean keepIntermediates) { - final Pair files = initializeRecalibrationPlot(filename); - writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, true); - outputRecalibrationPlot(filename, files, keepIntermediates); + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final Covariate[] requestedCovariates) { + generateRecalibrationPlot(RAC, original, null, requestedCovariates); } - public static void generateRecalibrationPlot(final File filename, final RecalibrationTables original, final RecalibrationTables recalibrated, final Covariate[] requestedCovariates, final boolean keepIntermediates) { - final Pair files = initializeRecalibrationPlot(filename); - writeCSV(files.getFirst(), recalibrated, "RECALIBRATED", requestedCovariates, true); - writeCSV(files.getFirst(), original, "ORIGINAL", requestedCovariates, false); - outputRecalibrationPlot(filename, files, keepIntermediates); + public static void generateRecalibrationPlot(final RecalibrationArgumentCollection RAC, final RecalibrationTables original, final RecalibrationTables recalibrated, final 
Covariate[] requestedCovariates) { + final PrintStream csvFile; + try { + if ( RAC.RECAL_CSV_FILE == null ) { + RAC.RECAL_CSV_FILE = File.createTempFile("BQSR", ".csv"); + RAC.RECAL_CSV_FILE.deleteOnExit(); + } + csvFile = new PrintStream(RAC.RECAL_CSV_FILE); + } catch (IOException e) { + throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_CSV_FILE, e); + } + + if ( recalibrated != null ) + writeCSV(csvFile, recalibrated, "RECALIBRATED", requestedCovariates, true); + writeCSV(csvFile, original, "ORIGINAL", requestedCovariates, recalibrated == null); + outputRecalibrationPlot(RAC); } private static void writeCSV(final PrintStream deltaTableFile, final RecalibrationTables recalibrationTables, final String recalibrationMode, final Covariate[] requestedCovariates, final boolean printHeader) { @@ -409,14 +400,14 @@ public class RecalUtils { final NestedHashMap deltaTable = new NestedHashMap(); // add the quality score table to the delta table - final NestedIntegerArray qualTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); - for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); + for (final NestedIntegerArray.Leaf leaf : qualTable.getAllLeaves()) { // go through every element in the covariates table to create the delta table final int[] newCovs = new int[4]; newCovs[0] = leaf.keys[0]; - newCovs[1] = requestedCovariates.length; // replace the covariate name with an arbitrary (unused) index for QualityScore + newCovs[1] = requestedCovariates.length; // replace the covariate name with an arbitrary (unused) index for QualityScore newCovs[2] = leaf.keys[1]; newCovs[3] = leaf.keys[2]; - addToDeltaTable(deltaTable, newCovs, (RecalDatum)leaf.value); // add this covariate to the delta table + addToDeltaTable(deltaTable, newCovs, (RecalDatum)leaf.value); // add 
this covariate to the delta table } // add the optional covariates to the delta table @@ -425,10 +416,10 @@ public class RecalUtils { for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) { final int[] covs = new int[4]; covs[0] = leaf.keys[0]; - covs[1] = i; // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) + covs[1] = i; // reset the quality score covariate to 0 from the keyset (so we aggregate all rows regardless of QS) covs[2] = leaf.keys[2]; covs[3] = leaf.keys[3]; - addToDeltaTable(deltaTable, covs, (RecalDatum) leaf.value); // add this covariate to the delta table + addToDeltaTable(deltaTable, covs, (RecalDatum) leaf.value); // add this covariate to the delta table } } @@ -486,11 +477,11 @@ public class RecalUtils { */ private static void addToDeltaTable(final NestedHashMap deltaTable, final int[] deltaKey, final RecalDatum recalDatum) { Object[] wrappedKey = wrapKeys(deltaKey); - final RecalDatum deltaDatum = (RecalDatum)deltaTable.get(wrappedKey); // check if we already have a RecalDatum for this key + final RecalDatum deltaDatum = (RecalDatum)deltaTable.get(wrappedKey); // check if we already have a RecalDatum for this key if (deltaDatum == null) - deltaTable.put(new RecalDatum(recalDatum), wrappedKey); // if we don't have a key yet, create a new one with the same values as the curent datum + deltaTable.put(new RecalDatum(recalDatum), wrappedKey); // if we don't have a key yet, create a new one with the same values as the curent datum else - deltaDatum.combine(recalDatum); // if we do have a datum, combine it with this one. + deltaDatum.combine(recalDatum); // if we do have a datum, combine it with this one. 
} private static Object[] wrapKeys(final int[] keys) { @@ -539,10 +530,11 @@ public class RecalUtils { * @return true if this read is consistent or false if this read should be skipped */ public static boolean isColorSpaceConsistent(final SOLID_NOCALL_STRATEGY strategy, final GATKSAMRecord read) { - if (!ReadUtils.isSOLiDRead(read)) // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base + if (!ReadUtils.isSOLiDRead(read)) // If this is a SOLID read then we have to check if the color space is inconsistent. This is our only sign that SOLID has inserted the reference base return true; - if (read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG) == null) { // Haven't calculated the inconsistency array yet for this read + // Haven't calculated the inconsistency array yet for this read + if (read.getAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG) == null) { final Object attr = read.getAttribute(RecalUtils.COLOR_SPACE_ATTRIBUTE_TAG); if (attr != null) { byte[] colorSpace; @@ -562,13 +554,13 @@ public class RecalUtils { } } - byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read + byte[] readBases = read.getReadBases(); // Loop over the read and calculate first the inferred bases from the color and then check if it is consistent with the read if (read.getReadNegativeStrandFlag()) readBases = BaseUtils.simpleReverseComplement(read.getReadBases()); final byte[] inconsistency = new byte[readBases.length]; int i; - byte prevBase = colorSpace[0]; // The sentinel + byte prevBase = colorSpace[0]; // The sentinel for (i = 0; i < readBases.length; i++) { final byte thisBase = getNextBaseFromColor(read, prevBase, colorSpace[i + 1]); inconsistency[i] = (byte) (thisBase == readBases[i] ? 
0 : 1); @@ -576,11 +568,11 @@ public class RecalUtils { } read.setAttribute(RecalUtils.COLOR_SPACE_INCONSISTENCY_TAG, inconsistency); } - else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it + else if (strategy == SOLID_NOCALL_STRATEGY.THROW_EXCEPTION) // if the strategy calls for an exception, throw it throw new UserException.MalformedBAM(read, "Unable to find color space information in SOLiD read. First observed at read with name = " + read.getReadName() + " Unfortunately this .bam file can not be recalibrated without color space information because of potential reference bias."); else - return false; // otherwise, just skip the read + return false; // otherwise, just skip the read } return true; @@ -774,6 +766,4 @@ public class RecalUtils { return base; } } - - } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index e6ab9e38b..527306c85 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -2,11 +2,12 @@ package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; -import org.broadinstitute.sting.gatk.walkers.bqsr.*; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import java.io.File; import 
java.io.PrintStream; @@ -19,13 +20,13 @@ import java.util.*; * @since 3/26/12 */ public class RecalibrationReport { - private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) - private final RecalibrationTables recalibrationTables; // quick access reference to the tables - private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation + private QuantizationInfo quantizationInfo; // histogram containing the counts for qual quantization (calculated after recalibration is done) + private final RecalibrationTables recalibrationTables; // quick access reference to the tables + private final Covariate[] requestedCovariates; // list of all covariates to be used in this calculation private final HashMap optionalCovariateIndexes; - private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes - private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter + private final GATKReportTable argumentTable; // keep the argument table untouched just for output purposes + private final RecalibrationArgumentCollection RAC; // necessary for quantizing qualities with the same parameter private final int[] tempRGarray = new int[2]; private final int[] tempQUALarray = new int[3]; @@ -40,7 +41,7 @@ public class RecalibrationReport { GATKReportTable quantizedTable = report.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE); quantizationInfo = initializeQuantizationTable(quantizedTable); - Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates + Pair, ArrayList> covariates = RecalUtils.initializeCovariates(RAC); // initialize the required and optional covariates ArrayList requiredCovariates = covariates.getFirst(); ArrayList optionalCovariates = covariates.getSecond(); requestedCovariates = new 
Covariate[requiredCovariates.size() + optionalCovariates.size()]; @@ -50,19 +51,19 @@ public class RecalibrationReport { requestedCovariates[covariateIndex++] = covariate; for (final Covariate covariate : optionalCovariates) { requestedCovariates[covariateIndex] = covariate; - final String covariateName = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport + final String covariateName = covariate.getClass().getSimpleName().split("Covariate")[0]; // get the name of the covariate (without the "covariate" part of it) so we can match with the GATKReport optionalCovariateIndexes.put(covariateName, covariateIndex-2); covariateIndex++; } for (Covariate cov : requestedCovariates) - cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection + cov.initialize(RAC); // initialize any covariate member variables using the shared argument collection recalibrationTables = new RecalibrationTables(requestedCovariates, countReadGroups(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE))); - parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE)); + parseReadGroupTable(report.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE), recalibrationTables.getReadGroupTable()); - parseQualityScoreTable(report.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE), recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE)); + parseQualityScoreTable(report.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE), recalibrationTables.getQualityScoreTable()); parseAllCovariatesTable(report.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE), recalibrationTables); @@ -105,9 +106,9 @@ public class RecalibrationReport { */ public void combine(final RecalibrationReport other) { - for (RecalibrationTables.TableType type : 
RecalibrationTables.TableType.values()) { - final NestedIntegerArray myTable = recalibrationTables.getTable(type); - final NestedIntegerArray otherTable = other.recalibrationTables.getTable(type); + for ( int tableIndex = 0; tableIndex < recalibrationTables.numTables(); tableIndex++ ) { + final NestedIntegerArray myTable = recalibrationTables.getTable(tableIndex); + final NestedIntegerArray otherTable = other.recalibrationTables.getTable(tableIndex); for (final NestedIntegerArray.Leaf row : otherTable.getAllLeaves()) { final RecalDatum myDatum = myTable.get(row.keys); @@ -193,14 +194,26 @@ public class RecalibrationReport { } } + private double asDouble(final Object o) { + if ( o instanceof Double ) + return (Double)o; + else if ( o instanceof Integer ) + return (Integer)o; + else if ( o instanceof Long ) + return (Long)o; + else + throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass()); + } + private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { - final long nObservations = (Long) reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME); - final long nErrors = (Long) reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME); + final double nObservations = asDouble(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); + final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)); final double empiricalQuality = (Double) reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME); - final double estimatedQReported = hasEstimatedQReportedColumn ? 
// the estimatedQreported column only exists in the ReadGroup table - (Double) reportTable.get(row, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table - Byte.parseByte((String) reportTable.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table + // the estimatedQreported column only exists in the ReadGroup table + final double estimatedQReported = hasEstimatedQReportedColumn ? + (Double) reportTable.get(row, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME) : // we get it if we are in the read group table + Byte.parseByte((String) reportTable.get(row, RecalUtils.QUALITY_SCORE_COLUMN_NAME)); // or we use the reported quality if we are in any other table final RecalDatum datum = new RecalDatum(nObservations, nErrors, (byte)1); datum.setEstimatedQReported(estimatedQReported); @@ -242,7 +255,7 @@ public class RecalibrationReport { final String argument = table.get(i, "Argument").toString(); Object value = table.get(i, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); if (value.equals("null")) - value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport + value = null; // generic translation of null values that were printed out as strings | todo -- add this capability to the GATKReport if (argument.equals("covariate") && value != null) RAC.COVARIATES = value.toString().split(","); @@ -283,14 +296,11 @@ public class RecalibrationReport { else if (argument.equals("quantizing_levels")) RAC.QUANTIZING_LEVELS = Integer.parseInt((String) value); - else if (argument.equals("keep_intermediate_files")) - RAC.KEEP_INTERMEDIATE_FILES = Boolean.parseBoolean((String) value); - - else if (argument.equals("no_plots")) - RAC.NO_PLOTS = Boolean.parseBoolean((String) value); - else if (argument.equals("recalibration_report")) - RAC.recalibrationReport = (value == null) ? 
null : new File((String) value); + RAC.existingRecalibrationReport = (value == null) ? null : new File((String) value); + + else if (argument.equals("plot_pdf_file")) + RAC.RECAL_PDF_FILE = (value == null) ? null : new File((String) value); else if (argument.equals("binary_tag_name")) RAC.BINARY_TAG_NAME = (value == null) ? null : (String) value; diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java index f37e69c9a..0dd510245 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -25,9 +25,12 @@ package org.broadinstitute.sting.utils.recalibration; +import org.broadinstitute.sting.utils.collections.LoggingNestedIntegerArray; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import java.io.PrintStream; + /** * Utility class to facilitate on-the-fly base quality score recalibration. 
* @@ -52,23 +55,39 @@ public class RecalibrationTables { private final NestedIntegerArray[] tables; public RecalibrationTables(final Covariate[] covariates) { - this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1); + this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, null); + } + + public RecalibrationTables(final Covariate[] covariates, final PrintStream log) { + this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, log); } public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) { + this(covariates, numReadGroups, null); + } + + public RecalibrationTables(final Covariate[] covariates, final int numReadGroups, final PrintStream log) { tables = new NestedIntegerArray[covariates.length]; final int qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.index].maximumKeyValue() + 1; final int eventDimension = EventType.values().length; - tables[TableType.READ_GROUP_TABLE.index] = new NestedIntegerArray(numReadGroups, eventDimension); - tables[TableType.QUALITY_SCORE_TABLE.index] = new NestedIntegerArray(numReadGroups, qualDimension, eventDimension); + tables[TableType.READ_GROUP_TABLE.index] = log == null ? new NestedIntegerArray(numReadGroups, eventDimension) : + new LoggingNestedIntegerArray(log, "READ_GROUP_TABLE", numReadGroups, eventDimension); + tables[TableType.QUALITY_SCORE_TABLE.index] = log == null ? new NestedIntegerArray(numReadGroups, qualDimension, eventDimension) : + new LoggingNestedIntegerArray(log, "QUALITY_SCORE_TABLE", numReadGroups, qualDimension, eventDimension); for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.index; i < covariates.length; i++) - tables[i] = new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension); + tables[i] = log == null ? 
new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) : + new LoggingNestedIntegerArray(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.index + 1), + numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension); } - public NestedIntegerArray getTable(final TableType type) { - return (NestedIntegerArray)tables[type.index]; + public NestedIntegerArray getReadGroupTable() { + return (NestedIntegerArray)tables[TableType.READ_GROUP_TABLE.index]; + } + + public NestedIntegerArray getQualityScoreTable() { + return (NestedIntegerArray)tables[TableType.QUALITY_SCORE_TABLE.index]; } public NestedIntegerArray getTable(final int index) { diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java index 570944245..5e470b35f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ContextCovariate.java @@ -87,7 +87,8 @@ public class ContextCovariate implements StandardCovariate { // store the original bases and then write Ns over low quality ones final byte[] originalBases = read.getReadBases().clone(); - final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); // Write N's over the low quality tail of the reads to avoid adding them into the context + // Write N's over the low quality tail of the reads to avoid adding them into the context + final GATKSAMRecord clippedRead = ReadClipper.clipLowQualEnds(read, LOW_QUAL_TAIL, ClippingRepresentation.WRITE_NS); final boolean negativeStrand = clippedRead.getReadNegativeStrandFlag(); byte[] bases = clippedRead.getReadBases(); @@ -115,7 +116,7 @@ public class ContextCovariate implements 
StandardCovariate { @Override public String formatKey(final int key) { - if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file + if (key == -1) // this can only happen in test routines because we do not propagate null keys to the csv file return null; return contextFromKey(key); @@ -176,9 +177,9 @@ public class ContextCovariate implements StandardCovariate { for (int currentIndex = contextSize; currentIndex < readLength; currentIndex++) { final int baseIndex = BaseUtils.simpleBaseToBaseIndex(bases[currentIndex]); - if (baseIndex == -1) { // ignore non-ACGT bases + if (baseIndex == -1) { // ignore non-ACGT bases currentNPenalty = contextSize; - currentKey = 0; // reset the key + currentKey = 0; // reset the key } else { // push this base's contribution onto the key: shift everything 2 bits, mask out the non-context bits, and add the new base and the length in currentKey = (currentKey >> 2) & mask; @@ -215,7 +216,7 @@ public class ContextCovariate implements StandardCovariate { int bitOffset = LENGTH_BITS; for (int i = start; i < end; i++) { final int baseIndex = BaseUtils.simpleBaseToBaseIndex(dna[i]); - if (baseIndex == -1) // ignore non-ACGT bases + if (baseIndex == -1) // ignore non-ACGT bases return -1; key |= (baseIndex << bitOffset); bitOffset += 2; @@ -233,15 +234,15 @@ public class ContextCovariate implements StandardCovariate { if (key < 0) throw new ReviewedStingException("dna conversion cannot handle negative numbers. 
Possible overflow?"); - final int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context - int mask = 48; // use the mask to pull out bases + final int length = key & LENGTH_MASK; // the first bits represent the length (in bp) of the context + int mask = 48; // use the mask to pull out bases int offset = LENGTH_BITS; StringBuilder dna = new StringBuilder(); for (int i = 0; i < length; i++) { final int baseIndex = (key & mask) >> offset; dna.append((char)BaseUtils.baseIndexToSimpleBase(baseIndex)); - mask = mask << 2; // move the mask over to the next 2 bits + mask = mask << 2; // move the mask over to the next 2 bits offset += 2; } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java index cdf12d284..5d0d94b69 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java @@ -108,7 +108,7 @@ public class CycleCovariate implements StandardCovariate { // the current sequential model would consider the effects independently instead of jointly. final boolean multiplyByNegative1 = read.getReadPairedFlag() && read.getSecondOfPairFlag(); - int cycle = multiplyByNegative1 ? -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms. + int cycle = multiplyByNegative1 ? -1 : 1; // todo -- check if this is the right behavior for mate paired reads in flow cycle platforms. 
// BUGBUG: Consider looking at degradation of base quality scores in homopolymer runs to detect when the cycle incremented even though the nucleotide didn't change // For example, AAAAAAA was probably read in two flow cycles but here we count it as one @@ -201,9 +201,9 @@ public class CycleCovariate implements StandardCovariate { @Override public String formatKey(final int key) { - int cycle = key >> 1; // shift so we can remove the "sign" bit - if ( (key & 1) != 0 ) // is the last bit set? - cycle *= -1; // then the cycle is negative + int cycle = key >> 1; // shift so we can remove the "sign" bit + if ( (key & 1) != 0 ) // is the last bit set? + cycle *= -1; // then the cycle is negative return String.format("%d", cycle); } @@ -222,7 +222,7 @@ public class CycleCovariate implements StandardCovariate { int result = Math.abs(cycle); result = result << 1; // shift so we can add the "sign" bit if ( cycle < 0 ) - result++; // negative cycles get the lower-most bit set + result++; // negative cycles get the lower-most bit set return result; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java index 85568dac9..29c15adf7 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/ReadGroupCovariate.java @@ -66,7 +66,9 @@ public class ReadGroupCovariate implements RequiredCovariate { } @Override - public String formatKey(final int key) { + public synchronized String formatKey(final int key) { + // This method is synchronized so that we don't attempt to do a get() + // from the reverse lookup table while that table is being updated return readGroupReverseLookupTable.get(key); } @@ -76,17 +78,32 @@ public class ReadGroupCovariate implements RequiredCovariate { } 
private int keyForReadGroup(final String readGroupId) { - if (!readGroupLookupTable.containsKey(readGroupId)) { - readGroupLookupTable.put(readGroupId, nextId); - readGroupReverseLookupTable.put(nextId, readGroupId); - nextId++; + // Rather than synchronize this entire method (which would be VERY expensive for walkers like the BQSR), + // synchronize only the table updates. + + // Before entering the synchronized block, check to see if this read group is not in our tables. + // If it's not, either we will have to insert it, OR another thread will insert it first. + // This preliminary check avoids doing any synchronization most of the time. + if ( ! readGroupLookupTable.containsKey(readGroupId) ) { + + synchronized ( this ) { + + // Now we need to make sure the key is STILL not there, since another thread may have come along + // and inserted it while we were waiting to enter this synchronized block! + if ( ! readGroupLookupTable.containsKey(readGroupId) ) { + readGroupLookupTable.put(readGroupId, nextId); + readGroupReverseLookupTable.put(nextId, readGroupId); + nextId++; + } + } } return readGroupLookupTable.get(readGroupId); } @Override - public int maximumKeyValue() { + public synchronized int maximumKeyValue() { + // Synchronized so that we don't query table size while the tables are being updated return readGroupLookupTable.size() - 1; } diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java new file mode 100644 index 000000000..7c2d9bfdc --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialMultiSampleReadStream.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including 
without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.picard.sam.MergingSamRecordIterator; +import net.sf.picard.sam.SamFileHeaderMerger; +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMFileReader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.*; + +/** + * Simple wrapper class that multiplexes multiple ArtificialSingleSampleReadStreams into a single stream of reads + * + * @author David Roazen + */ +public class ArtificialMultiSampleReadStream implements Iterable { + + private Collection perSampleArtificialReadStreams; + private MergingSamRecordIterator mergingIterator; + + public ArtificialMultiSampleReadStream( Collection perSampleArtificialReadStreams ) { + if ( perSampleArtificialReadStreams == null || perSampleArtificialReadStreams.isEmpty() ) { + throw new ReviewedStingException("Can't create an ArtificialMultiSampleReadStream out of 0 
ArtificialSingleSampleReadStreams"); + } + + this.perSampleArtificialReadStreams = perSampleArtificialReadStreams; + } + + public Iterator iterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return mergingIterator; + } + + public StingSAMIterator getStingSAMIterator() { + // lazy initialization to prevent reads from being created until they're needed + initialize(); + + return StingSAMIteratorAdapter.adapt(mergingIterator); + } + + private void initialize() { + Collection perSampleSAMReaders = new ArrayList(perSampleArtificialReadStreams.size()); + Collection headers = new ArrayList(perSampleArtificialReadStreams.size()); + + for ( ArtificialSingleSampleReadStream readStream : perSampleArtificialReadStreams ) { + Collection thisStreamReads = readStream.makeReads(); + + SAMFileReader reader = new ArtificialSAMFileReader(readStream.getHeader(), + thisStreamReads.toArray(new SAMRecord[thisStreamReads.size()])); + perSampleSAMReaders.add(reader); + headers.add(reader.getFileHeader()); + } + + SamFileHeaderMerger headerMerger = new SamFileHeaderMerger(SAMFileHeader.SortOrder.coordinate, headers, true); + mergingIterator = new MergingSamRecordIterator(headerMerger, perSampleSAMReaders, true); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java index 475f7de21..9632a687b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialReadsTraversal.java @@ -69,7 +69,7 @@ public class ArtificialReadsTraversal extends TraversalEngine reads; + private SAMFileHeader customHeader = null; + /** * Construct an artificial SAM file reader. + * @param sequenceDictionary sequence dictionary used to initialize our GenomeLocParser * @param reads Reads to use as backing data source. 
*/ public ArtificialSAMFileReader(SAMSequenceDictionary sequenceDictionary,SAMRecord... reads) { @@ -50,6 +53,30 @@ public class ArtificialSAMFileReader extends SAMFileReader { this.reads = Arrays.asList(reads); } + /** + * Construct an artificial SAM file reader with the given SAM file header + * + * @param customHeader Header that should be returned by calls to getFileHeader() on this reader + * @param reads Reads to use as backing data source. + */ + public ArtificialSAMFileReader( SAMFileHeader customHeader, SAMRecord... reads ) { + super(createEmptyInputStream(),true); + + this.customHeader = customHeader; + this.genomeLocParser = new GenomeLocParser(customHeader.getSequenceDictionary()); + this.reads = Arrays.asList(reads); + } + + + @Override + public SAMFileHeader getFileHeader() { + if ( customHeader != null ) { + return customHeader; + } + + return super.getFileHeader(); + } + /** * @{inheritDoc} */ diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java index d0211db07..0859957a3 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSAMUtils.java @@ -276,6 +276,30 @@ public class ArtificialSAMUtils { return Arrays.asList(left, right); } + /** + * Create a collection of identical artificial reads based on the parameters. The cigar string for each + * read will be *M, where * is the length of the read. + * + * Useful for testing things like positional downsampling where you care only about the position and + * number of reads, and not the other attributes. + * + * @param stackSize number of identical reads to create + * @param header the SAM header to associate each read with + * @param name name associated with each read + * @param refIndex the reference index, i.e. 
what chromosome to associate them with + * @param alignmentStart where to start each alignment + * @param length the length of each read + * + * @return a collection of stackSize reads all sharing the above properties + */ + public static Collection createStackOfIdenticalArtificialReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { + Collection stack = new ArrayList(stackSize); + for ( int i = 1; i <= stackSize; i++ ) { + stack.add(createArtificialRead(header, name, refIndex, alignmentStart, length)); + } + return stack; + } + /** * create an iterator containing the specified read piles * diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java new file mode 100644 index 000000000..a9480692b --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStream.java @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; + +/** + * An artificial stream of reads from a single read group/sample with configurable characteristics + * such as: + * + * -the number of contigs that the reads should be distributed across + * -number of "stacks" of reads sharing the same alignment start position per contig + * -the min/max number of reads in each stack (exact values chosen randomly from this range) + * -the min/max distance between stack start positions (exact values chosen randomly from this range) + * -the min/max length of each read (exact values chosen randomly from this range) + * -the number of unmapped reads + * + * The cigar string for all reads will be *M, where * is the length of the read. 
+ * + * @author David Roazen + */ +public class ArtificialSingleSampleReadStream implements Iterable { + private SAMFileHeader header; + private String readGroupID; + private int numContigs; + private int numStacksPerContig; + private int minReadsPerStack; + private int maxReadsPerStack; + private int minDistanceBetweenStacks; + private int maxDistanceBetweenStacks; + private int minReadLength; + private int maxReadLength; + private int numUnmappedReads; + + private static final String READ_GROUP_TAG = "RG"; + + public ArtificialSingleSampleReadStream( SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + this.header = header; + this.readGroupID = readGroupID; + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + this.minReadLength = minReadLength; + this.maxReadLength = maxReadLength; + this.numUnmappedReads = numUnmappedReads; + + validateStreamParameters(); + } + + private void validateStreamParameters() { + if ( header == null || readGroupID == null ) { + throw new ReviewedStingException("null SAMFileHeader or read group ID") ; + } + + if ( header.getReadGroup(readGroupID) == null ) { + throw new ReviewedStingException(String.format("Read group %s not found in SAMFileHeader", readGroupID)); + } + + if ( numContigs < 0 || numStacksPerContig < 0 || minReadsPerStack < 0 || maxReadsPerStack < 0 || + minDistanceBetweenStacks < 0 || maxDistanceBetweenStacks < 0 || minReadLength < 0 || maxReadLength < 0 || + numUnmappedReads < 0 ) { + throw new ReviewedStingException("Read stream parameters must be >= 0"); + } + + if ( 
(numContigs == 0 && numStacksPerContig != 0) || (numContigs != 0 && numStacksPerContig == 0) ) { + throw new ReviewedStingException("numContigs and numStacksPerContig must either both be > 0, or both be 0"); + } + + if ( minReadsPerStack > maxReadsPerStack ) { + throw new ReviewedStingException("minReadsPerStack > maxReadsPerStack"); + } + + if ( minDistanceBetweenStacks > maxDistanceBetweenStacks ) { + throw new ReviewedStingException("minDistanceBetweenStacks > maxDistanceBetweenStacks"); + } + + if ( minReadLength > maxReadLength ) { + throw new ReviewedStingException("minReadLength > maxReadLength"); + } + } + + public Iterator iterator() { + return makeReads().iterator(); + } + + public StingSAMIterator getStingSAMIterator() { + return StingSAMIteratorAdapter.adapt(iterator()); + } + + public Collection makeReads() { + Collection reads = new ArrayList(numContigs * numStacksPerContig * maxReadsPerStack); + + for ( int contig = 0; contig < numContigs; contig++ ) { + int alignmentStart = 1; + + for ( int stack = 0; stack < numStacksPerContig; stack++ ) { + reads.addAll(makeReadStack(contig, alignmentStart, MathUtils.randomIntegerInRange(minReadsPerStack, maxReadsPerStack))); + alignmentStart += MathUtils.randomIntegerInRange(minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + if ( numUnmappedReads > 0 ) { + reads.addAll(makeReadStack(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, SAMRecord.NO_ALIGNMENT_START, numUnmappedReads)); + } + + return reads; + } + + private Collection makeReadStack( int contig, int alignmentStart, int stackSize ) { + Collection readStack = new ArrayList(stackSize); + + for ( int i = 0; i < stackSize; i++ ) { + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, + "foo", + contig, + alignmentStart, + MathUtils.randomIntegerInRange(minReadLength, maxReadLength)); + read.setAttribute(READ_GROUP_TAG, readGroupID); + readStack.add(read); + } + + return readStack; + } + + public SAMFileHeader getHeader() { + return header; + 
} + + public String getReadGroupID() { + return readGroupID; + } + + public int getNumContigs() { + return numContigs; + } + + public int getNumStacksPerContig() { + return numStacksPerContig; + } + + public int getMinReadsPerStack() { + return minReadsPerStack; + } + + public int getMaxReadsPerStack() { + return maxReadsPerStack; + } + + public int getMinDistanceBetweenStacks() { + return minDistanceBetweenStacks; + } + + public int getMaxDistanceBetweenStacks() { + return maxDistanceBetweenStacks; + } + + public int getMinReadLength() { + return minReadLength; + } + + public int getMaxReadLength() { + return maxReadLength; + } + + public int getNumUnmappedReads() { + return numUnmappedReads; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java new file mode 100644 index 000000000..a4d7c5146 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamAnalyzer.java @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; + +import java.util.ArrayList; +import java.util.List; + +/** + * A class for analyzing and validating the read stream produced by an ArtificialSingleSampleReadStream. + * + * Collects various statistics about the stream of reads it's fed, and validates the stream + * by checking whether the collected statistics match the nominal properties of the stream. + * + * Subclasses are expected to override the validate() method in order to check whether an artificial + * read stream has been *transformed* in some way (eg., by downsampling or some other process), rather + * than merely checking whether the stream matches its original properties. 
+ * + * Usage is simple: + * + * ArtificialSingleSampleReadStreamAnalyzer analyzer = new ArtificialSingleSampleReadStreamAnalyzer(originalStream); + * analyzer.analyze(originalOrTransformedStream); + * analyzer.validate(); // override this method if you want to check whether the stream has been transformed + * // in a certain way relative to the original stream + * + * @author David Roazen + */ +public class ArtificialSingleSampleReadStreamAnalyzer { + protected ArtificialSingleSampleReadStream originalStream; + protected SAMRecord lastRead; + protected int totalReads; + protected boolean allSamplesMatch; + protected int numContigs; + protected List stacksPerContig; + protected Integer minReadsPerStack; + protected Integer maxReadsPerStack; + protected Integer minDistanceBetweenStacks; + protected Integer maxDistanceBetweenStacks; + protected Integer minReadLength; + protected Integer maxReadLength; + protected int numUnmappedReads; + + protected int currentContigNumStacks; + protected int currentStackNumReads; + + /** + * Construct a new read stream analyzer, providing an ArtificialSingleSampleReadStream that will + * serve as the basis for comparison after the analysis is complete. 
+ * + * @param originalStream the original ArtificialSingleSampleReadStream upon which the stream + * that will be fed to the analyzer is based + */ + public ArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream ) { + this.originalStream = originalStream; + reset(); + } + + /** + * Reset all read stream statistics collected by this analyzer to prepare for a fresh run + */ + public void reset() { + lastRead = null; + totalReads = 0; + allSamplesMatch = true; + numContigs = 0; + stacksPerContig = new ArrayList(); + minReadsPerStack = null; + maxReadsPerStack = null; + minDistanceBetweenStacks = null; + maxDistanceBetweenStacks = null; + minReadLength = null; + maxReadLength = null; + numUnmappedReads = 0; + currentContigNumStacks = 0; + currentStackNumReads = 0; + } + + /** + * Collect statistics on the stream of reads passed in + * + * @param stream the stream of reads to analyze + */ + public void analyze( Iterable stream ) { + for ( SAMRecord read : stream ) { + update(read); + } + finalizeStats(); + } + + /** + * Validate the stream by checking whether our collected statistics match the properties of the + * original stream. Throws a ReviewedStingException if the stream is invalid. + * + * Override this method if you want to check whether the stream has been transformed in some + * way relative to the original stream. + */ + public void validate() { + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { + if ( totalReads != 0 ) { + throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); + } + return; // no further validation needed for the 0-reads case + } + else if ( totalReads == 0 ) { + throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); + } + + if ( ! 
allSamplesMatch ) { + throw new ReviewedStingException("some reads had the wrong sample"); + } + + if ( numContigs != originalStream.getNumContigs() ) { + throw new ReviewedStingException("number of contigs not correct"); + } + + if ( stacksPerContig.size() != originalStream.getNumContigs() ) { + throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", + stacksPerContig.size(), originalStream.getNumContigs())); + } + + for ( int contigStackCount : stacksPerContig ) { + if ( contigStackCount != originalStream.getNumStacksPerContig() ) { + throw new ReviewedStingException("contig had incorrect number of stacks"); + } + } + + if ( originalStream.getNumStacksPerContig() > 0 ) { + if ( minReadsPerStack < originalStream.getMinReadsPerStack() ) { + throw new ReviewedStingException("stack had fewer than the minimum number of reads"); + } + if ( maxReadsPerStack > originalStream.getMaxReadsPerStack() ) { + throw new ReviewedStingException("stack had more than the maximum number of reads"); + } + } + else if ( minReadsPerStack != null || maxReadsPerStack != null ) { + throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); + } + + if ( originalStream.getNumStacksPerContig() > 1 ) { + if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by less than the minimum distance"); + } + if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by more than the maximum distance"); + } + } + else if ( minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { + throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); + } + + if ( minReadLength < 
originalStream.getMinReadLength() ) { + throw new ReviewedStingException("read was shorter than the minimum allowed length"); + } + if ( maxReadLength > originalStream.getMaxReadLength() ) { + throw new ReviewedStingException("read was longer than the maximum allowed length"); + } + + if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { + throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", + originalStream.getNumUnmappedReads(), numUnmappedReads)); + } + + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && + numUnmappedReads != totalReads ) { + throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); + } + } + + public void update( SAMRecord read ) { + if ( read.getReadUnmappedFlag() ) { + numUnmappedReads++; + + if ( numUnmappedReads == 1 && lastRead != null ) { + processContigChange(); + numContigs--; + } + } + else if ( lastRead == null ) { + numContigs = 1; + currentContigNumStacks = 1; + currentStackNumReads = 1; + } + else if ( ! 
read.getReferenceIndex().equals(lastRead.getReferenceIndex()) ) { + processContigChange(); + } + else if ( read.getAlignmentStart() != lastRead.getAlignmentStart() ) { + processStackChangeWithinContig(read); + } + else { + currentStackNumReads++; + } + + updateReadLength(read.getReadLength()); + allSamplesMatch = allSamplesMatch && readHasCorrectSample(read); + totalReads++; + + lastRead = read; + } + + + private void processContigChange() { + numContigs++; + + stacksPerContig.add(currentContigNumStacks); + currentContigNumStacks = 1; + + updateReadsPerStack(currentStackNumReads); + currentStackNumReads = 1; + } + + private void processStackChangeWithinContig( SAMRecord read ) { + currentContigNumStacks++; + + updateReadsPerStack(currentStackNumReads); + currentStackNumReads = 1; + + updateDistanceBetweenStacks(read.getAlignmentStart() - lastRead.getAlignmentStart()); + } + + private void updateReadsPerStack( int stackReadCount ) { + if ( minReadsPerStack == null || stackReadCount < minReadsPerStack ) { + minReadsPerStack = stackReadCount; + } + if ( maxReadsPerStack == null || stackReadCount > maxReadsPerStack ) { + maxReadsPerStack = stackReadCount; + } + } + + private void updateDistanceBetweenStacks( int stackDistance ) { + if ( minDistanceBetweenStacks == null || stackDistance < minDistanceBetweenStacks ) { + minDistanceBetweenStacks = stackDistance; + } + if ( maxDistanceBetweenStacks == null || stackDistance > maxDistanceBetweenStacks ) { + maxDistanceBetweenStacks = stackDistance; + } + } + + private void updateReadLength( int readLength ) { + if ( minReadLength == null || readLength < minReadLength ) { + minReadLength = readLength; + } + if ( maxReadLength == null || readLength > maxReadLength ) { + maxReadLength = readLength; + } + } + + private boolean readHasCorrectSample( SAMRecord read ) { + return originalStream.getReadGroupID().equals(read.getAttribute("RG")); + } + + public void finalizeStats() { + if ( lastRead != null && ! 
lastRead.getReadUnmappedFlag() ) { + stacksPerContig.add(currentContigNumStacks); + updateReadsPerStack(currentStackNumReads); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index c9b3a2df8..9fdb48b34 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -31,6 +31,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.Arrays; import java.util.HashMap; +import java.util.List; import java.util.Map; /** @@ -59,8 +60,9 @@ public class GATKSAMRecord extends BAMRecord { private String mReadString = null; private GATKSAMReadGroupRecord mReadGroup = null; private byte[] reducedReadCounts = null; - private int softStart = -1; - private int softEnd = -1; + private final static int UNINITIALIZED = -1; + private int softStart = UNINITIALIZED; + private int softEnd = UNINITIALIZED; // because some values can be null, we don't want to duplicate effort private boolean retrievedReadGroup = false; @@ -228,8 +230,7 @@ public class GATKSAMRecord extends BAMRecord { if( quals == null ) { quals = new byte[getBaseQualities().length]; Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will - // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 - setBaseQualities(quals, EventType.BASE_INSERTION); + // be updated and the original quals will be pulled here, but for now we assume the original quality is a flat Q45 } return quals; } @@ -246,7 +247,6 @@ public class GATKSAMRecord extends BAMRecord { quals = new byte[getBaseQualities().length]; Arrays.fill(quals, (byte) 45); // Some day in the future when base insertion and base deletion quals exist the samtools API will // be updated and the 
original quals will be pulled here, but for now we assume the original quality is a flat Q45 - setBaseQualities(quals, EventType.BASE_DELETION); } return quals; } @@ -262,7 +262,7 @@ public class GATKSAMRecord extends BAMRecord { public void setReadGroup( final GATKSAMReadGroupRecord readGroup ) { mReadGroup = readGroup; retrievedReadGroup = true; - setAttribute("RG", mReadGroup.getId()); // todo -- this should be standardized, but we don't have access to SAMTagUtils! + setAttribute("RG", mReadGroup.getId()); // todo -- this should be standardized, but we don't have access to SAMTagUtils! } /////////////////////////////////////////////////////////////////////////////// @@ -367,15 +367,15 @@ public class GATKSAMRecord extends BAMRecord { * Clears all attributes except ReadGroup of the read. */ public GATKSAMRecord simplify () { - GATKSAMReadGroupRecord rg = getReadGroup(); // save the read group information + GATKSAMReadGroupRecord rg = getReadGroup(); // save the read group information byte[] insQuals = (this.getAttribute(BQSR_BASE_INSERTION_QUALITIES) == null) ? null : getBaseInsertionQualities(); byte[] delQuals = (this.getAttribute(BQSR_BASE_DELETION_QUALITIES) == null) ? 
null : getBaseDeletionQualities(); - this.clearAttributes(); // clear all attributes from the read - this.setReadGroup(rg); // restore read group + this.clearAttributes(); // clear all attributes from the read + this.setReadGroup(rg); // restore read group if (insQuals != null) - this.setBaseQualities(insQuals, EventType.BASE_INSERTION); // restore base insertion if we had any + this.setBaseQualities(insQuals, EventType.BASE_INSERTION); // restore base insertion if we had any if (delQuals != null) - this.setBaseQualities(delQuals, EventType.BASE_DELETION); // restore base deletion if we had any + this.setBaseQualities(delQuals, EventType.BASE_DELETION); // restore base deletion if we had any return this; } @@ -387,15 +387,16 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped start of the read taking soft clips (but not hard clips) into account */ public int getSoftStart() { - if (softStart < 0) { - int start = this.getUnclippedStart(); - for (CigarElement cigarElement : this.getCigar().getCigarElements()) { - if (cigarElement.getOperator() == CigarOperator.HARD_CLIP) - start += cigarElement.getLength(); - else + if ( softStart == UNINITIALIZED ) { + softStart = getAlignmentStart(); + for (final CigarElement cig : getCigar().getCigarElements()) { + final CigarOperator op = cig.getOperator(); + + if (op == CigarOperator.SOFT_CLIP) + softStart -= cig.getLength(); + else if (op != CigarOperator.HARD_CLIP) break; } - softStart = start; } return softStart; } @@ -408,24 +409,26 @@ public class GATKSAMRecord extends BAMRecord { * @return the unclipped end of the read taking soft clips (but not hard clips) into account */ public int getSoftEnd() { - if (softEnd < 0) { - int stop = this.getUnclippedStart(); + if ( softEnd == UNINITIALIZED ) { + boolean foundAlignedBase = false; + softEnd = getAlignmentEnd(); + final List cigs = getCigar().getCigarElements(); + for (int i = cigs.size() - 1; i >= 0; --i) { + final CigarElement cig = cigs.get(i); + final 
CigarOperator op = cig.getOperator(); - if (ReadUtils.readIsEntirelyInsertion(this)) - return stop; - - int shift = 0; - CigarOperator lastOperator = null; - for (CigarElement cigarElement : this.getCigar().getCigarElements()) { - stop += shift; - lastOperator = cigarElement.getOperator(); - if (cigarElement.getOperator().consumesReferenceBases() || cigarElement.getOperator() == CigarOperator.SOFT_CLIP || cigarElement.getOperator() == CigarOperator.HARD_CLIP) - shift = cigarElement.getLength(); - else - shift = 0; + if (op == CigarOperator.SOFT_CLIP) // assumes the soft clip that we found is at the end of the aligned read + softEnd += cig.getLength(); + else if (op != CigarOperator.HARD_CLIP) { + foundAlignedBase = true; + break; + } + } + if( !foundAlignedBase ) { // for example 64H14S, the soft end is actually the same as the alignment end + softEnd = getAlignmentEnd(); } - softEnd = (lastOperator == CigarOperator.HARD_CLIP) ? stop-1 : stop+shift-1 ; } + return softEnd; } diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java new file mode 100644 index 000000000..b30198608 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactory.java @@ -0,0 +1,158 @@ +/* + * The MIT License + * + * Copyright (c) 2009 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * 
all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +package org.broadinstitute.sting.utils.threading; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.apache.log4j.Priority; +import org.broadinstitute.sting.utils.AutoFormattingTime; + +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadInfo; +import java.lang.management.ThreadMXBean; +import java.util.ArrayList; +import java.util.EnumMap; +import java.util.List; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.ThreadFactory; +import java.util.concurrent.TimeUnit; + +/** + * Creates threads that automatically monitor their efficiency via the parent ThreadEfficiencyMonitor + * + * User: depristo + * Date: 8/14/12 + * Time: 8:47 AM + */ +@Invariant({ + "activeThreads.size() <= nThreadsToCreate", + "countDownLatch.getCount() <= nThreadsToCreate", + "nThreadsCreated <= nThreadsToCreate" +}) +public class EfficiencyMonitoringThreadFactory extends ThreadEfficiencyMonitor implements ThreadFactory { + final int nThreadsToCreate; + final List activeThreads; + + int nThreadsCreated = 0; + + /** + * Counts down the number of active activeThreads whose runtime info hasn't been incorporated into + * times. Counts down from nThreadsToCreate to 0, at which point any code waiting + * on the final times is freed to run. 
+ */ + final CountDownLatch countDownLatch; + + /** + * Create a new factory generating threads whose runtime and contention + * behavior is tracked in this factory. + * + * @param nThreadsToCreate the number of threads we will create in the factory before it's considered complete + */ + public EfficiencyMonitoringThreadFactory(final int nThreadsToCreate) { + super(); + if ( nThreadsToCreate <= 0 ) throw new IllegalArgumentException("nThreadsToCreate <= 0: " + nThreadsToCreate); + + this.nThreadsToCreate = nThreadsToCreate; + activeThreads = new ArrayList(nThreadsToCreate); + countDownLatch = new CountDownLatch(nThreadsToCreate); + } + + /** + * How many threads have been created by this factory so far? + * @return + */ + @Ensures("result >= 0") + public int getNThreadsCreated() { + return nThreadsCreated; + } + + /** + * Only useful for testing, so that we can wait for all of the threads in the factory to complete running + * + * @throws InterruptedException + */ + protected void waitForAllThreadsToComplete() throws InterruptedException { + countDownLatch.await(); + } + + @Ensures({ + "activeThreads.size() <= old(activeThreads.size())", + "! 
activeThreads.contains(thread)", + "countDownLatch.getCount() <= old(countDownLatch.getCount())" + }) + @Override + public synchronized void threadIsDone(final Thread thread) { + nThreadsAnalyzed++; + + if ( DEBUG ) logger.warn(" Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); + + super.threadIsDone(thread); + + // remove the thread from the list of active activeThreads, if it's in there, and decrement the countdown latch + if ( activeThreads.remove(thread) ) { + // one less thread is live for those blocking on all activeThreads to be complete + countDownLatch.countDown(); + if ( DEBUG ) logger.warn(" -> Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); + } + } + + /** + * Create a new thread from this factory + * + * @param runnable + * @return + */ + @Override + @Ensures({ + "activeThreads.size() > old(activeThreads.size())", + "activeThreads.contains(result)", + "nThreadsCreated == old(nThreadsCreated) + 1" + }) + public synchronized Thread newThread(final Runnable runnable) { + if ( activeThreads.size() >= nThreadsToCreate) + throw new IllegalStateException("Attempting to create more activeThreads than allowed by constructor argument nThreadsToCreate " + nThreadsToCreate); + + nThreadsCreated++; + final Thread myThread = new TrackingThread(runnable); + activeThreads.add(myThread); + return myThread; + } + + /** + * A wrapper around Thread that tracks the runtime of the thread and calls threadIsDone() when complete + */ + private class TrackingThread extends Thread { + private TrackingThread(Runnable runnable) { + super(runnable); + } + + @Override + public void run() { + super.run(); + threadIsDone(this); + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java new file mode 100644 index 000000000..b25375b87 --- /dev/null +++ 
b/public/java/src/org/broadinstitute/sting/utils/threading/NamedThreadFactory.java @@ -0,0 +1,26 @@ +package org.broadinstitute.sting.utils.threading; + +import java.util.concurrent.ThreadFactory; + +/** + * Thread factory that produces threads with a given name pattern + * + * User: depristo + * Date: 9/5/12 + * Time: 9:22 PM + * + */ +public class NamedThreadFactory implements ThreadFactory { + static int id = 0; + final String format; + + public NamedThreadFactory(String format) { + this.format = format; + String.format(format, id); // test the name + } + + @Override + public Thread newThread(Runnable r) { + return new Thread(r, String.format(format, id++)); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactory.java b/public/java/src/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactory.java deleted file mode 100644 index 39d5c1497..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactory.java +++ /dev/null @@ -1,293 +0,0 @@ -/* - * The MIT License - * - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -package org.broadinstitute.sting.utils.threading; - -import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; -import org.apache.log4j.Logger; -import org.apache.log4j.Priority; -import org.broadinstitute.sting.utils.AutoFormattingTime; - -import java.lang.management.ManagementFactory; -import java.lang.management.ThreadInfo; -import java.lang.management.ThreadMXBean; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.EnumMap; -import java.util.List; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.ThreadFactory; - -/** - * Create activeThreads, collecting statistics about their running state over time - * - * Uses a ThreadMXBean to capture info via ThreadInfo - * - * User: depristo - * Date: 8/14/12 - * Time: 8:47 AM - */ -@Invariant({ - "activeThreads.size() <= nThreadsToCreate", - "countDownLatch.getCount() <= nThreadsToCreate", - "nThreadsToCreated <= nThreadsToCreate" -}) -public class StateMonitoringThreadFactory implements ThreadFactory { - protected static final boolean DEBUG = false; - private static Logger logger = Logger.getLogger(StateMonitoringThreadFactory.class); - public static final List TRACKED_STATES = Arrays.asList(Thread.State.BLOCKED, Thread.State.RUNNABLE, Thread.State.WAITING); - - // todo -- it would be nice to not have to specify upfront the number of threads. - // todo -- can we dynamically increment countDownLatch? It seems not... 
- final int nThreadsToCreate; - final List activeThreads; - final EnumMap times = new EnumMap(Thread.State.class); - - int nThreadsToCreated = 0; - - /** - * The bean used to get the thread info about blocked and waiting times - */ - final ThreadMXBean bean; - - /** - * Counts down the number of active activeThreads whose runtime info hasn't been incorporated into - * times. Counts down from nThreadsToCreate to 0, at which point any code waiting - * on the final times is freed to run. - */ - final CountDownLatch countDownLatch; - - /** - * Instead of RUNNABLE we want to print running. This map goes from Thread.State names to human readable ones - */ - final static EnumMap PRETTY_NAMES = new EnumMap(Thread.State.class); - static { - PRETTY_NAMES.put(Thread.State.RUNNABLE, "running"); - PRETTY_NAMES.put(Thread.State.BLOCKED, "blocked"); - PRETTY_NAMES.put(Thread.State.WAITING, "waiting"); - } - - /** - * Create a new factory generating threads whose runtime and contention - * behavior is tracked in this factory. 
- * - * @param nThreadsToCreate the number of threads we will create in the factory before it's considered complete - * // TODO -- remove argument when we figure out how to implement this capability - */ - public StateMonitoringThreadFactory(final int nThreadsToCreate) { - if ( nThreadsToCreate <= 0 ) throw new IllegalArgumentException("nThreadsToCreate <= 0: " + nThreadsToCreate); - - this.nThreadsToCreate = nThreadsToCreate; - activeThreads = new ArrayList(nThreadsToCreate); - - // initialize times to 0 - for ( final Thread.State state : Thread.State.values() ) - times.put(state, 0l); - - // get the bean, and start tracking - bean = ManagementFactory.getThreadMXBean(); - if ( bean.isThreadContentionMonitoringSupported() ) - bean.setThreadContentionMonitoringEnabled(true); - else - logger.warn("Thread contention monitoring not supported, we cannot track GATK multi-threaded efficiency"); - //bean.setThreadCpuTimeEnabled(true); - - countDownLatch = new CountDownLatch(nThreadsToCreate); - } - - /** - * Get the time spent in state across all threads created by this factory - * - * @param state on of the TRACKED_STATES - * @return the time in milliseconds - */ - @Ensures({"result >= 0", "TRACKED_STATES.contains(state)"}) - public synchronized long getStateTime(final Thread.State state) { - return times.get(state); - } - - /** - * Get the total time spent in all states across all threads created by this factory - * - * @return the time in milliseconds - */ - @Ensures({"result >= 0"}) - public synchronized long getTotalTime() { - long total = 0; - for ( final long time : times.values() ) - total += time; - return total; - } - - /** - * Get the fraction of time spent in state across all threads created by this factory - * - * @return the fraction (0.0-1.0) of time spent in state over all state times of all threads - */ - @Ensures({"result >= 0.0", "result <= 1.0", "TRACKED_STATES.contains(state)"}) - public synchronized double getStateFraction(final Thread.State state) { 
- return getStateTime(state) / (1.0 * Math.max(getTotalTime(), 1)); - } - - /** - * How many threads have been created by this factory so far? - * @return - */ - @Ensures("result >= 0") - public int getNThreadsCreated() { - return nThreadsToCreated; - } - - public void waitForAllThreadsToComplete() throws InterruptedException { - countDownLatch.await(); - } - - @Override - public synchronized String toString() { - final StringBuilder b = new StringBuilder(); - - b.append("total ").append(getTotalTime()).append(" "); - for ( final Thread.State state : TRACKED_STATES ) { - b.append(state).append(" ").append(getStateTime(state)).append(" "); - } - - return b.toString(); - } - - /** - * Print usage information about threads from this factory to logger - * with the INFO priority - * - * @param logger - */ - public synchronized void printUsageInformation(final Logger logger) { - printUsageInformation(logger, Priority.INFO); - } - - /** - * Print usage information about threads from this factory to logger - * with the provided priority - * - * @param logger - */ - public synchronized void printUsageInformation(final Logger logger, final Priority priority) { - logger.log(priority, "Number of activeThreads used: " + getNThreadsCreated()); - logger.log(priority, "Total runtime " + new AutoFormattingTime(getTotalTime() / 1000.0)); - for ( final Thread.State state : TRACKED_STATES ) { - logger.log(priority, String.format(" Fraction of time spent %s is %.2f (%s)", - prettyName(state), getStateFraction(state), new AutoFormattingTime(getStateTime(state) / 1000.0))); - } - logger.log(priority, String.format("Efficiency of multi-threading: %.2f%% of time spent doing productive work", - getStateFraction(Thread.State.RUNNABLE) * 100)); - } - - private String prettyName(final Thread.State state) { - return PRETTY_NAMES.get(state); - } - - /** - * Create a new thread from this factory - * - * @param runnable - * @return - */ - @Override - @Ensures({ - "activeThreads.size() > 
old(activeThreads.size())", - "activeThreads.contains(result)", - "nThreadsToCreated == old(nThreadsToCreated) + 1" - }) - public synchronized Thread newThread(final Runnable runnable) { - if ( activeThreads.size() >= nThreadsToCreate) - throw new IllegalStateException("Attempting to create more activeThreads than allowed by constructor argument nThreadsToCreate " + nThreadsToCreate); - - nThreadsToCreated++; - final Thread myThread = new TrackingThread(runnable); - activeThreads.add(myThread); - return myThread; - } - - /** - * Update the information about completed thread that ran for runtime in milliseconds - * - * This method updates all of the key timing and tracking information in the factory so that - * thread can be retired. After this call the factory shouldn't have a pointer to the thread any longer - * - * @param thread - * @param runtimeInMilliseconds - */ - @Ensures({ - "activeThreads.size() < old(activeThreads.size())", - "! activeThreads.contains(thread)", - "getTotalTime() >= old(getTotalTime())", - "countDownLatch.getCount() < old(countDownLatch.getCount())" - }) - private synchronized void threadIsDone(final Thread thread, final long runtimeInMilliseconds) { - if ( DEBUG ) logger.warn(" Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); - if ( DEBUG ) logger.warn("UpdateThreadInfo called"); - - final ThreadInfo info = bean.getThreadInfo(thread.getId()); - if ( info != null ) { - if ( DEBUG ) logger.warn("Updating thread total runtime " + runtimeInMilliseconds + " of which blocked " + info.getBlockedTime() + " and waiting " + info.getWaitedTime()); - incTimes(Thread.State.BLOCKED, info.getBlockedTime()); - incTimes(Thread.State.WAITING, info.getWaitedTime()); - incTimes(Thread.State.RUNNABLE, runtimeInMilliseconds - info.getWaitedTime() - info.getBlockedTime()); - } - - // remove the thread from the list of active activeThreads - if ( ! 
activeThreads.remove(thread) ) - throw new IllegalStateException("Thread " + thread + " not in list of active activeThreads"); - - // one less thread is live for those blocking on all activeThreads to be complete - countDownLatch.countDown(); - if ( DEBUG ) logger.warn(" -> Countdown " + countDownLatch.getCount() + " in thread " + Thread.currentThread().getName()); - } - - /** - * Helper function that increments the times counter by by for state - * - * @param state - * @param by - */ - private synchronized void incTimes(final Thread.State state, final long by) { - times.put(state, times.get(state) + by); - } - - /** - * A wrapper around Thread that tracks the runtime of the thread and calls threadIsDone() when complete - */ - private class TrackingThread extends Thread { - private TrackingThread(Runnable runnable) { - super(runnable); - } - - @Override - public void run() { - final long startTime = System.currentTimeMillis(); - super.run(); - final long endTime = System.currentTimeMillis(); - threadIsDone(this, endTime - startTime); - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java new file mode 100644 index 000000000..9159f5657 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadEfficiencyMonitor.java @@ -0,0 +1,207 @@ +package org.broadinstitute.sting.utils.threading; + +import com.google.java.contract.Ensures; +import com.google.java.contract.Invariant; +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; +import org.apache.log4j.Priority; +import org.broadinstitute.sting.utils.AutoFormattingTime; + +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadInfo; +import java.lang.management.ThreadMXBean; +import java.util.EnumMap; +import java.util.concurrent.TimeUnit; + +/** + * Uses an MXBean to monitor thread efficiency + * + * Once 
the monitor is created, calls to threadIsDone() can be used to add information + * about the efficiency of the provided thread to this monitor. + * + * Provides simple print() for displaying efficiency information to a logger + * + * User: depristo + * Date: 8/22/12 + * Time: 10:48 AM + */ +@Invariant({"nThreadsAnalyzed >= 0"}) +public class ThreadEfficiencyMonitor { + protected static final boolean DEBUG = false; + protected static Logger logger = Logger.getLogger(EfficiencyMonitoringThreadFactory.class); + final EnumMap times = new EnumMap(State.class); + + /** + * The number of threads we've included in our efficiency monitoring + */ + int nThreadsAnalyzed = 0; + + /** + * The bean used to get the thread info about blocked and waiting times + */ + final ThreadMXBean bean; + + public ThreadEfficiencyMonitor() { + bean = ManagementFactory.getThreadMXBean(); + + // get the bean, and start tracking + if ( bean.isThreadContentionMonitoringSupported() ) + bean.setThreadContentionMonitoringEnabled(true); + else + logger.warn("Thread contention monitoring not supported, we cannot track GATK multi-threaded efficiency"); + //bean.setThreadCpuTimeEnabled(true); + + if ( bean.isThreadCpuTimeSupported() ) + bean.setThreadCpuTimeEnabled(true); + else + logger.warn("Thread CPU monitoring not supported, we cannot track GATK multi-threaded efficiency"); + + // initialize times to 0 + for ( final State state : State.values() ) + times.put(state, 0l); + } + + private static long nanoToMilli(final long timeInNano) { + return TimeUnit.NANOSECONDS.toMillis(timeInNano); + } + + /** + * Get the time spent in state across all threads created by this factory + * + * @param state to get information about + * @return the time in milliseconds + */ + @Ensures({"result >= 0"}) + public synchronized long getStateTime(final State state) { + return times.get(state); + } + + /** + * Get the total time spent in all states across all threads created by this factory + * + * @return the time in 
milliseconds + */ + @Ensures({"result >= 0"}) + public synchronized long getTotalTime() { + long total = 0; + for ( final long time : times.values() ) + total += time; + return total; + } + + /** + * Get the fraction of time spent in state across all threads created by this factory + * + * @return the percentage (0.0-100.0) of time spent in state over all state times of all threads + */ + @Ensures({"result >= 0.0", "result <= 100.0"}) + public synchronized double getStatePercent(final State state) { + return (100.0 * getStateTime(state)) / Math.max(getTotalTime(), 1); + } + + public int getnThreadsAnalyzed() { + return nThreadsAnalyzed; + } + + @Override + public synchronized String toString() { + final StringBuilder b = new StringBuilder(); + + b.append("total ").append(getTotalTime()).append(" "); + for ( final State state : State.values() ) { + b.append(state).append(" ").append(getStateTime(state)).append(" "); + } + + return b.toString(); + } + + /** + * Print usage information about threads from this factory to logger + * with the INFO priority + * + * @param logger + */ + public synchronized void printUsageInformation(final Logger logger) { + printUsageInformation(logger, Priority.INFO); + } + + /** + * Print usage information about threads from this factory to logger + * with the provided priority + * + * @param logger + */ + public synchronized void printUsageInformation(final Logger logger, final Priority priority) { + logger.debug("Number of threads monitored: " + getnThreadsAnalyzed()); + logger.debug("Total runtime " + new AutoFormattingTime(TimeUnit.MILLISECONDS.toSeconds(getTotalTime()))); + for ( final State state : State.values() ) { + logger.debug(String.format("\tPercent of time spent %s is %.2f", state.getUserFriendlyName(), getStatePercent(state))); + } + logger.log(priority, String.format("CPU efficiency : %6.2f%% of time spent %s", getStatePercent(State.USER_CPU), State.USER_CPU.getUserFriendlyName())); + logger.log(priority, 
String.format("Walker inefficiency : %6.2f%% of time spent %s", getStatePercent(State.BLOCKING), State.BLOCKING.getUserFriendlyName())); + logger.log(priority, String.format("I/O inefficiency : %6.2f%% of time spent %s", getStatePercent(State.WAITING_FOR_IO), State.WAITING_FOR_IO.getUserFriendlyName())); + logger.log(priority, String.format("Thread inefficiency : %6.2f%% of time spent %s", getStatePercent(State.WAITING), State.WAITING.getUserFriendlyName())); + } + + /** + * Update the information about completed thread that ran for runtime in milliseconds + * + * This method updates all of the key timing and tracking information in the factory so that + * thread can be retired. After this call the factory shouldn't have a pointer to the thread any longer + * + * @param thread the thread whose information we are updating + */ + @Ensures({ + "getTotalTime() >= old(getTotalTime())" + }) + public synchronized void threadIsDone(final Thread thread) { + nThreadsAnalyzed++; + + if ( DEBUG ) logger.warn("UpdateThreadInfo called"); + + final long threadID = thread.getId(); + final ThreadInfo info = bean.getThreadInfo(thread.getId()); + final long totalTimeNano = bean.getThreadCpuTime(threadID); + final long userTimeNano = bean.getThreadUserTime(threadID); + final long systemTimeNano = totalTimeNano - userTimeNano; + final long userTimeInMilliseconds = nanoToMilli(userTimeNano); + final long systemTimeInMilliseconds = nanoToMilli(systemTimeNano); + + if ( info != null ) { + if ( DEBUG ) logger.warn("Updating thread with user runtime " + userTimeInMilliseconds + " and system runtime " + systemTimeInMilliseconds + " of which blocked " + info.getBlockedTime() + " and waiting " + info.getWaitedTime()); + incTimes(State.BLOCKING, info.getBlockedTime()); + incTimes(State.WAITING, info.getWaitedTime()); + incTimes(State.USER_CPU, userTimeInMilliseconds); + incTimes(State.WAITING_FOR_IO, systemTimeInMilliseconds); + } + } + + /** + * Helper function that increments the times 
counter by by for state + * + * @param state + * @param by + */ + @Requires({"state != null", "by >= 0"}) + @Ensures("getTotalTime() == old(getTotalTime()) + by") + private synchronized void incTimes(final State state, final long by) { + times.put(state, times.get(state) + by); + } + + public enum State { + BLOCKING("blocking on synchronized data structures"), + WAITING("waiting on some other thread"), + USER_CPU("doing productive CPU work"), + WAITING_FOR_IO("waiting for I/O"); + + private final String userFriendlyName; + + private State(String userFriendlyName) { + this.userFriendlyName = userFriendlyName; + } + + public String getUserFriendlyName() { + return userFriendlyName; + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java new file mode 100644 index 000000000..cc50152ac --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/threading/ThreadLocalArray.java @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.threading; + +import java.lang.reflect.Array; + +/** + * ThreadLocal implementation for arrays + * + * Example usage: + * + * private ThreadLocal threadLocalByteArray = new ThreadLocalArray(length, byte.class); + * .... + * byte[] byteArray = threadLocalByteArray.get(); + * + * @param the type of the array itself (eg., int[], double[], etc.) + * + * @author David Roazen + */ +public class ThreadLocalArray extends ThreadLocal { + private int arraySize; + private Class arrayElementType; + + /** + * Create a new ThreadLocalArray + * + * @param arraySize desired length of the array + * @param arrayElementType type of the elements within the array (eg., Byte.class, Integer.class, etc.) 
+ */ + public ThreadLocalArray( int arraySize, Class arrayElementType ) { + super(); + + this.arraySize = arraySize; + this.arrayElementType = arrayElementType; + } + + @Override + @SuppressWarnings("unchecked") + protected T initialValue() { + return (T)Array.newInstance(arrayElementType, arraySize); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java index 2c312678e..85c925204 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Allele.java @@ -105,7 +105,7 @@ public class Allele implements Comparable { if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele"); } else { - bases = BaseUtils.convertToUpperCase(bases); + BaseUtils.convertToUpperCase(bases); } this.isRef = isRef; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java index fae0a7c4c..67e80cf3c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java @@ -288,6 +288,28 @@ public abstract class Genotype implements Comparable { return hasLikelihoods() ? GenotypeLikelihoods.fromPLs(getPL()) : null; } + /** + * Are all likelihoods for this sample non-informative? + * + * Returns true if all PLs are 0 => 0,0,0 => true + * 0,0,0,0,0,0 => true + * 0,10,100 => false + * + * @return true if all samples PLs are equal and == 0 + */ + public boolean isNonInformative() { + if ( getPL() == null ) + return true; + else { + for ( final int PL : getPL() ) { + if ( PL != 0 ) + return false; + } + + return true; + } + } + /** * Unsafe low-level accessor the PL field itself, may be null. 
* diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java index 0ee32fa2e..8fd792d3b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypeBuilder.java @@ -53,6 +53,9 @@ import java.util.*; */ @Invariant({"alleles != null"}) public final class GenotypeBuilder { + private static final List HAPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL); + private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); + private String sampleName = null; private List alleles = Collections.emptyList(); @@ -90,6 +93,23 @@ public final class GenotypeBuilder { return new GenotypeBuilder(sampleName, alleles).PL(gls).make(); } + /** + * Create a new Genotype object for a sample that's missing from the VC (i.e., in + * the output header). The genotype is a no call with one NO_CALL allele per ploidy (./. when ploidy is 2). + * + * @param sampleName the name of this sample + * @return an initialized Genotype with sampleName that's a no call genotype of the given ploidy + */ + public static Genotype createMissing(final String sampleName, final int ploidy) { + final GenotypeBuilder builder = new GenotypeBuilder(sampleName); + switch ( ploidy ) { + case 1: builder.alleles(HAPLOID_NO_CALL); break; + case 2: builder.alleles(DIPLOID_NO_CALL); break; + default: builder.alleles(Collections.nCopies(ploidy, Allele.NO_CALL)); break; + } + return builder.make(); + } + /** * Create a empty builder. Both a sampleName and alleles must be provided * before trying to make a Genotype from this builder.
diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java index ba8668fa9..f306bac4d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/GenotypesContext.java @@ -25,7 +25,6 @@ package org.broadinstitute.sting.utils.variantcontext; import com.google.java.contract.Ensures; -import com.google.java.contract.Invariant; import com.google.java.contract.Requires; import java.util.*; @@ -413,14 +412,26 @@ public class GenotypesContext implements List { return getGenotypes().get(i); } + /** + * What is the max ploidy among all samples? Returns defaultPloidy if no genotypes are present + * + * @param defaultPloidy the default ploidy, if all samples are no-called + * @return + */ @Ensures("result >= 0") - public int getMaxPloidy() { + public int getMaxPloidy(final int defaultPloidy) { + if ( defaultPloidy < 0 ) throw new IllegalArgumentException("defaultPloidy must be greater than or equal to 0"); + if ( maxPloidy == -1 ) { maxPloidy = 0; // necessary in the case where there are no genotypes for ( final Genotype g : getGenotypes() ) { maxPloidy = Math.max(g.getPloidy(), maxPloidy); } + + // everything is no called so we return the default ploidy + if ( maxPloidy == 0 ) maxPloidy = defaultPloidy; } + return maxPloidy; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 1fe6b8652..27a5b0c24 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -178,9 +178,8 @@ import java.util.*; */ public class VariantContext implements Feature { // to enable tribble integration private final 
static boolean WARN_ABOUT_BAD_END = true; + private final static int MAX_ALLELE_SIZE_FOR_NON_SV = 150; final protected static Logger logger = Logger.getLogger(VariantContext.class); - - private boolean fullyDecoded = false; protected CommonInfo commonInfo = null; public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; @@ -530,6 +529,28 @@ public class VariantContext implements Feature { // to enable tribble integratio return getType() == Type.SYMBOLIC; } + public boolean isStructuralIndel() { + if ( getType() == Type.INDEL ) { + List sizes = getIndelLengths(); + if ( sizes != null ) { + for ( Integer length : sizes ) { + if ( length > MAX_ALLELE_SIZE_FOR_NON_SV ) { + return true; + } + } + } + } + return false; + } + + /** + * + * @return true if the variant is symbolic or a large indel + */ + public boolean isSymbolicOrSV() { + return isSymbolic() || isStructuralIndel(); + } + public boolean isMNP() { return getType() == Type.MNP; } @@ -621,14 +642,15 @@ public class VariantContext implements Feature { // to enable tribble integratio } /** - * Returns the maximum ploidy of all samples in this VC, or -1 if there are no genotypes + * Returns the maximum ploidy of all samples in this VC, or default if there are no genotypes * * This function is caching, so it's only expensive on the first call * - * @return -1, or the max ploidy + * @param defaultPloidy the default ploidy, if all samples are no-called + * @return default, or the max ploidy */ - public int getMaxPloidy() { - return genotypes.getMaxPloidy(); + public int getMaxPloidy(final int defaultPloidy) { + return genotypes.getMaxPloidy(defaultPloidy); } /** @@ -1049,15 +1071,17 @@ public class VariantContext implements Feature { // to enable tribble integratio if ( g.isCalled() ) observedAlleles.addAll(g.getAlleles()); } + if ( observedAlleles.contains(Allele.NO_CALL) ) + observedAlleles.remove(Allele.NO_CALL); if ( reportedAlleles.size() != observedAlleles.size() ) - throw new 
TribbleException.InternalCodecException(String.format("the ALT allele(s) for the record at position %s:%d do not match what is observed in the per-sample genotypes", getChr(), getStart())); + throw new TribbleException.InternalCodecException(String.format("one or more of the ALT allele(s) for the record at position %s:%d are not observed at all in the sample genotypes", getChr(), getStart())); int originalSize = reportedAlleles.size(); // take the intersection and see if things change observedAlleles.retainAll(reportedAlleles); if ( observedAlleles.size() != originalSize ) - throw new TribbleException.InternalCodecException(String.format("the ALT allele(s) for the record at position %s:%d do not match what is observed in the per-sample genotypes", getChr(), getStart())); + throw new TribbleException.InternalCodecException(String.format("one or more of the ALT allele(s) for the record at position %s:%d are not observed at all in the sample genotypes", getChr(), getStart())); } public void validateChromosomeCounts() { @@ -1250,6 +1274,7 @@ public class VariantContext implements Feature { // to enable tribble integratio // performs a pairwise comparison of a single alternate allele against the reference allele (whereas the MIXED type // is reserved for cases of multiple alternate alleles of different types). Therefore, if we've reached this point // in the code (so we're not a SNP, MNP, or symbolic allele), we absolutely must be an INDEL. 
+ return Type.INDEL; // old incorrect logic: @@ -1494,15 +1519,32 @@ public class VariantContext implements Feature { // to enable tribble integratio return best; } + /** + * Lookup the index of allele in this variant context + * + * @param allele the allele whose index we want to get + * @return the index of the allele into getAlleles(), or -1 if it cannot be found + */ + public int getAlleleIndex(final Allele allele) { + return getAlleles().indexOf(allele); + } + + /** + * Return the allele index #getAlleleIndex for each allele in alleles + * + * @param alleles the alleles we want to look up + * @return a list of indices for each allele, in order + */ + public List getAlleleIndices(final Collection alleles) { + final List indices = new LinkedList(); + for ( final Allele allele : alleles ) + indices.add(getAlleleIndex(allele)); + return indices; + } + public int[] getGLIndecesOfAlternateAllele(Allele targetAllele) { - - int index = 1; - for ( Allele allele : getAlternateAlleles() ) { - if ( allele.equals(targetAllele) ) - break; - index++; - } - + final int index = getAlleleIndex(targetAllele); + if ( index == -1 ) throw new IllegalArgumentException("Allele " + targetAllele + " not in this VariantContex " + this); return GenotypeLikelihoods.getPLIndecesOfAlleles(0, index); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java index d8ab4bd23..40ac089ef 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextBuilder.java @@ -94,6 +94,7 @@ public class VariantContextBuilder { this.start = start; this.stop = stop; this.alleles = alleles; + this.attributes = Collections.emptyMap(); // immutable toValidate.add(VariantContext.Validation.ALLELES); } diff --git 
a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index d7e4a7135..81959c998 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -32,8 +32,8 @@ import org.apache.log4j.Logger; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -47,7 +47,6 @@ public class VariantContextUtils { public final static String MERGE_REF_IN_ALL = "ReferenceInAll"; public final static String MERGE_FILTER_PREFIX = "filterIn"; - private static final List DIPLOID_NO_CALL = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); private static Set MISSING_KEYS_WARNED_ABOUT = new HashSet(); final public static JexlEngine engine = new JexlEngine(); @@ -60,31 +59,6 @@ public class VariantContextUtils { engine.setDebug(false); } - /** - * Ensures that VC contains all of the samples in allSamples by adding missing samples to - * the resulting VC with default diploid ./. genotypes - * - * @param vc the VariantContext - * @param allSamples all of the samples needed - * @return a new VariantContext with missing samples added - */ - public static VariantContext addMissingSamples(final VariantContext vc, final Set allSamples) { - // TODO -- what's the fastest way to do this calculation? 
- final Set missingSamples = new HashSet(allSamples); - missingSamples.removeAll(vc.getSampleNames()); - - if ( missingSamples.isEmpty() ) - return vc; - else { - //logger.warn("Adding " + missingSamples.size() + " missing samples to called context"); - final GenotypesContext gc = GenotypesContext.copy(vc.getGenotypes()); - for ( final String missing : missingSamples ) { - gc.add(new GenotypeBuilder(missing).alleles(DIPLOID_NO_CALL).make()); - } - return new VariantContextBuilder(vc).genotypes(gc).make(); - } - } - /** * Update the attributes of the attributes map given the VariantContext to reflect the * proper chromosome-based VCF tags @@ -183,11 +157,8 @@ public class VariantContextUtils { builder.attributes(calculateChromosomeCounts(vc, new HashMap(vc.getAttributes()), removeStaleValues, founderIds)); } - public static Genotype removePLs(Genotype g) { - if ( g.hasLikelihoods() ) - return new GenotypeBuilder(g).noPL().make(); - else - return g; + public static Genotype removePLsAndAD(final Genotype g) { + return ( g.hasLikelihoods() || g.hasAD() ) ? new GenotypeBuilder(g).noPL().noAD().make() : g; } public final static VCFCompoundHeaderLine getMetaDataForField(final VCFHeader header, final String field) { @@ -599,7 +570,7 @@ public class VariantContextUtils { } // if we have more alternate alleles in the merged VC than in one or more of the - // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF + // original VCs, we need to strip out the GL/PLs (because they are no longer accurate), as well as allele-dependent attributes like AC,AF, and AD for ( final VariantContext vc : VCs ) { if (vc.alleles.size() == 1) continue; @@ -607,7 +578,7 @@ public class VariantContextUtils { if ( ! genotypes.isEmpty() ) logger.debug(String.format("Stripping PLs at %s due incompatible alleles merged=%s vs. 
single=%s", genomeLocParser.createGenomeLoc(vc), alleles, vc.alleles)); - genotypes = stripPLs(genotypes); + genotypes = stripPLsAndAD(genotypes); // this will remove stale AC,AF attributed from vc calculateChromosomeCounts(vc, attributes, true); break; @@ -698,11 +669,11 @@ public class VariantContextUtils { return true; } - public static GenotypesContext stripPLs(GenotypesContext genotypes) { + public static GenotypesContext stripPLsAndAD(GenotypesContext genotypes) { GenotypesContext newGs = GenotypesContext.create(genotypes.size()); for ( final Genotype g : genotypes ) { - newGs.add(g.hasLikelihoods() ? removePLs(g) : g); + newGs.add(removePLsAndAD(g)); } return newGs; @@ -1369,10 +1340,7 @@ public class VariantContextUtils { public static VariantContext reverseTrimAlleles( final VariantContext inputVC ) { - // TODO - this function doesn't work with mixed records or records that started as mixed and then became non-mixed - // see whether we need to trim common reference base from all alleles - final int trimExtent = computeReverseClipping(inputVC.getAlleles(), inputVC.getReference().getDisplayString().getBytes(), 0, false); if ( trimExtent <= 0 || inputVC.getAlleles().size() <= 1 ) return inputVC; diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java index 913615a84..abe85e383 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantJEXLContext.java @@ -193,6 +193,8 @@ class JEXLMap implements Map { infoMap.put("isHet", g.isHet() ? "1" : "0"); infoMap.put("isHomVar", g.isHomVar() ? 
"1" : "0"); infoMap.put(VCFConstants.GENOTYPE_QUALITY_KEY, g.getGQ()); + if ( g.hasDP() ) + infoMap.put(VCFConstants.DEPTH_KEY, g.getDP()); for ( Map.Entry e : g.getExtendedAttributes().entrySet() ) { if ( e.getValue() != null && !e.getValue().equals(VCFConstants.MISSING_VALUE_v4) ) infoMap.put(e.getKey(), e.getValue()); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java index 5b81e7117..61c0129bb 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriter.java @@ -272,11 +272,7 @@ public abstract class BCF2FieldWriter { encodingType = BCF2Type.INT8; buildAlleleMap(vc); - nValuesPerGenotype = vc.getMaxPloidy(); - - // deal with the case where we have no call everywhere, in which case we write out diploid - if ( nValuesPerGenotype == -1 ) - nValuesPerGenotype = 2; + nValuesPerGenotype = vc.getMaxPloidy(2); super.start(encoder, vc); } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java index e4c64b26b..536f07f90 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2Writer.java @@ -32,7 +32,10 @@ import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Codec; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Type; import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils; import org.broadinstitute.sting.utils.codecs.bcf2.BCFVersion; -import org.broadinstitute.sting.utils.codecs.vcf.*; +import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.codecs.vcf.VCFContigHeaderLine; +import 
org.broadinstitute.sting.utils.codecs.vcf.VCFHeader; +import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.variantcontext.*; @@ -345,10 +348,12 @@ class BCF2Writer extends IndexingVariantContextWriter { final BCF2FieldWriter.GenotypesWriter writer = fieldManager.getGenotypeFieldWriter(field); if ( writer == null ) errorUnexpectedFieldToWrite(vc, field, "FORMAT"); + assert writer != null; + writer.start(encoder, vc); for ( final String name : sampleNames ) { Genotype g = vc.getGenotype(name); - if ( g == null ) VCFWriter.missingSampleError(vc, header); + if ( g == null ) g = GenotypeBuilder.createMissing(name, writer.nValuesPerGenotype); writer.addGenotype(encoder, vc, g); } writer.done(encoder, vc); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java index db74f2263..9a987f161 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/VCFWriter.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.utils.variantcontext.writer; import net.sf.samtools.SAMSequenceDictionary; import org.broad.tribble.TribbleException; import org.broad.tribble.util.ParsingUtils; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -251,7 +250,7 @@ class VCFWriter extends IndexingVariantContextWriter { mWriter.write("\n"); mWriter.flush(); // necessary so that writing to an output stream will work } catch (IOException e) { - throw new RuntimeException("Unable to write the VCF object to " 
+ getStreamName()); + throw new RuntimeException("Unable to write the VCF object to " + getStreamName(), e); } } @@ -339,13 +338,13 @@ class VCFWriter extends IndexingVariantContextWriter { */ private void addGenotypeData(VariantContext vc, Map alleleMap, List genotypeFormatKeys) throws IOException { + final int ploidy = vc.getMaxPloidy(2); + for ( String sample : mHeader.getGenotypeSamples() ) { mWriter.write(VCFConstants.FIELD_SEPARATOR); Genotype g = vc.getGenotype(sample); - if ( g == null ) { - missingSampleError(vc, mHeader); - } + if ( g == null ) g = GenotypeBuilder.createMissing(sample, ploidy); final List attrs = new ArrayList(genotypeFormatKeys.size()); for ( String field : genotypeFormatKeys ) { @@ -426,13 +425,6 @@ class VCFWriter extends IndexingVariantContextWriter { } } - public static final void missingSampleError(final VariantContext vc, final VCFHeader header) { - final List badSampleNames = new ArrayList(); - for ( final String x : header.getGenotypeSamples() ) - if ( ! vc.hasGenotype(x) ) badSampleNames.add(x); - throw new ReviewedStingException("BUG: we now require all samples in VCFheader to have genotype objects. 
Missing samples are " + Utils.join(",", badSampleNames)); - } - private boolean isMissingValue(String s) { // we need to deal with the case that it's a list of missing values return (countOccurrences(VCFConstants.MISSING_VALUE_v4.charAt(0), s) + countOccurrences(',', s) == s.length()); @@ -485,10 +477,10 @@ class VCFWriter extends IndexingVariantContextWriter { else if ( val instanceof List ) { result = formatVCFField(((List)val).toArray()); } else if ( val.getClass().isArray() ) { - int length = Array.getLength(val); + final int length = Array.getLength(val); if ( length == 0 ) return formatVCFField(null); - StringBuffer sb = new StringBuffer(formatVCFField(Array.get(val, 0))); + final StringBuilder sb = new StringBuilder(formatVCFField(Array.get(val, 0))); for ( int i = 1; i < length; i++) { sb.append(","); sb.append(formatVCFField(Array.get(val, i))); diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index 7e38c00f3..fa9f9e8a7 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -40,13 +40,13 @@ import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.variantcontext.VariantContextTestProvider; - -import java.io.*; - import org.testng.Assert; import org.testng.annotations.AfterSuite; import org.testng.annotations.BeforeMethod; +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.PrintStream; import java.text.SimpleDateFormat; import java.util.*; @@ -251,20 +251,43 @@ public class WalkerTest extends BaseTest { return false; } - protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { - return executeTest(name, spec, Arrays.asList(1, 4)); + public enum ParallelTestType { + 
TREE_REDUCIBLE, + NANO_SCHEDULED, + BOTH } - protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List parallelThreads) { + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec, ParallelTestType testType) { + final List ntThreads = testType == ParallelTestType.TREE_REDUCIBLE || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); + final List cntThreads = testType == ParallelTestType.NANO_SCHEDULED || testType == ParallelTestType.BOTH ? Arrays.asList(1, 4) : Collections.emptyList(); + + return executeTest(name, spec, ntThreads, cntThreads); + } + + protected Pair, List> executeTestParallel(final String name, WalkerTestSpec spec) { + return executeTestParallel(name, spec, ParallelTestType.TREE_REDUCIBLE); + } + + protected Pair, List> executeTest(final String name, WalkerTestSpec spec, List ntThreads, List cpuThreads) { String originalArgs = spec.args; Pair, List> results = null; - for ( int nt : parallelThreads ) { + boolean ran1 = false; + for ( int nt : ntThreads ) { String extra = nt == 1 ? "" : (" -nt " + nt); + ran1 = ran1 || nt == 1; spec.args = originalArgs + extra; results = executeTest(name + "-nt-" + nt, spec); } + for ( int nct : cpuThreads ) { + if ( nct != 1 ) { + String extra = " -nct " + nct; + spec.args = originalArgs + extra; + results = executeTest(name + "-cnt-" + nct, spec); + } + } + return results; } diff --git a/public/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java deleted file mode 100644 index a6af034cb..000000000 --- a/public/java/test/org/broadinstitute/sting/alignment/AlignerIntegrationTest.java +++ /dev/null @@ -1,27 +0,0 @@ -package org.broadinstitute.sting.alignment; - -import org.testng.annotations.Test; -import org.broadinstitute.sting.WalkerTest; - -import java.util.Arrays; - -/** - * Integration tests for the aligner. 
- * - * @author mhanna - * @version 0.1 - */ -public class AlignerIntegrationTest extends WalkerTest { - @Test - public void testBasicAlignment() { - String md5 = "a2bdf907b18114a86ca47f9fc23791bf"; - WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-R " + GATKDataLocation + "bwa/human_b36_both.fasta" + - " -T Align" + - " -I " + validationDataLocation + "NA12878_Pilot1_20.trimmed.unmapped.bam" + - " -o %s", - 1, // just one output file - Arrays.asList(md5)); - executeTest("testBasicAlignment", spec); - } -} diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java index 99d6b88f3..b1e788dc5 100644 --- a/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSiteUnitTest.java @@ -39,7 +39,7 @@ public class ArgumentMatchSiteUnitTest { @Test public void testFile() { - ArgumentMatchSource source = new ArgumentMatchSource(new File("test")); + ArgumentMatchSource source = new ArgumentMatchFileSource(new File("test")); ArgumentMatchSite site = new ArgumentMatchSite(source, 1); Assert.assertEquals(site.getSource(), source); Assert.assertEquals(site.getIndex(), 1); diff --git a/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java index 4bc7eb822..a183b2001 100644 --- a/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/commandline/ArgumentMatchSourceUnitTest.java @@ -35,15 +35,15 @@ public class ArgumentMatchSourceUnitTest extends BaseTest { public void testCommandLine() { ArgumentMatchSource source = ArgumentMatchSource.COMMAND_LINE; Assert.assertEquals(source.getType(), ArgumentMatchSourceType.CommandLine); - 
Assert.assertNull(source.getFile()); + Assert.assertNull(source.getDescription()); } @Test public void testFile() { File f = new File("test"); - ArgumentMatchSource source = new ArgumentMatchSource(f); - Assert.assertEquals(source.getType(), ArgumentMatchSourceType.File); - Assert.assertEquals(source.getFile(), f); + ArgumentMatchSource source = new ArgumentMatchFileSource(f); + Assert.assertEquals(source.getType(), ArgumentMatchSourceType.Provider); + Assert.assertEquals(source.getDescription(), "file " + f.getAbsolutePath()); } @Test(expectedExceptions = IllegalArgumentException.class) @@ -54,8 +54,8 @@ public class ArgumentMatchSourceUnitTest extends BaseTest { @Test public void testEquals() { ArgumentMatchSource cmdLine = ArgumentMatchSource.COMMAND_LINE; - ArgumentMatchSource fileA = new ArgumentMatchSource(new File("a")); - ArgumentMatchSource fileB = new ArgumentMatchSource(new File("b")); + ArgumentMatchSource fileA = new ArgumentMatchFileSource(new File("a")); + ArgumentMatchSource fileB = new ArgumentMatchFileSource(new File("b")); Assert.assertFalse(cmdLine.equals(null)); @@ -75,8 +75,8 @@ public class ArgumentMatchSourceUnitTest extends BaseTest { @Test public void testCompareTo() { ArgumentMatchSource cmdLine = ArgumentMatchSource.COMMAND_LINE; - ArgumentMatchSource fileA = new ArgumentMatchSource(new File("a")); - ArgumentMatchSource fileB = new ArgumentMatchSource(new File("b")); + ArgumentMatchSource fileA = new ArgumentMatchFileSource(new File("a")); + ArgumentMatchSource fileB = new ArgumentMatchFileSource(new File("b")); Assert.assertTrue(cmdLine.compareTo(cmdLine) == 0); Assert.assertTrue(cmdLine.compareTo(fileA) < 0); diff --git a/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java new file mode 100644 index 000000000..924c6ec5a --- /dev/null +++ 
b/public/java/test/org/broadinstitute/sting/commandline/InvalidArgumentIntegrationTest.java @@ -0,0 +1,41 @@ +package org.broadinstitute.sting.commandline; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.exceptions.UserException; + +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 8/31/12 + * Time: 11:03 AM + * To change this template use File | Settings | File Templates. + */ +public class InvalidArgumentIntegrationTest extends WalkerTest { + private static final String callsB36 = BaseTest.validationDataLocation + "lowpass.N3.chr1.raw.vcf"; + + private WalkerTest.WalkerTestSpec baseTest(String flag, String arg, Class exeption) { + return new WalkerTest.WalkerTestSpec("-T VariantsToTable -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s " + flag + " " + arg, + 1, exeption); + + } + + @Test + public void testUnknownReadFilter() { + executeTest("UnknownReadFilter",baseTest("-rf","TestUnknownReadFilter", UserException.MalformedReadFilterException.class)); + } + + @Test + public void testMalformedWalkerArgs() { + executeTest("MalformedWalkerArgs", + new WalkerTest.WalkerTestSpec("-T UnknownWalkerName -M 10 --variant:vcf " + + callsB36 + " -F POS,CHROM -R " + + b36KGReference + " -o %s ", + 1, UserException.MalformedWalkerArgumentsException.class)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java index 5c4db08bd..9483e4757 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/EngineFeaturesIntegrationTest.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk; import org.broadinstitute.sting.WalkerTest; +import 
org.broadinstitute.sting.gatk.walkers.qc.ErrorThrowing; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.DataProvider; @@ -83,24 +84,30 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { private class EngineErrorHandlingTestProvider extends TestDataProvider { final Class expectedException; - final boolean multiThreaded; + final String args; final int iterationsToTest; - public EngineErrorHandlingTestProvider(Class exceptedException, final boolean multiThreaded) { + public EngineErrorHandlingTestProvider(Class exceptedException, final String args) { super(EngineErrorHandlingTestProvider.class); this.expectedException = exceptedException; - this.multiThreaded = multiThreaded; - this.iterationsToTest = multiThreaded ? 1000 : 1; - setName(String.format("Engine error handling: expected %s, is-multithreaded %b", exceptedException, multiThreaded)); + this.args = args; + this.iterationsToTest = args.equals("") ? 
1 : 10; + setName(String.format("Engine error handling: expected %s with args %s", exceptedException, args)); } } @DataProvider(name = "EngineErrorHandlingTestProvider") public Object[][] makeEngineErrorHandlingTestProvider() { - for ( final boolean multiThreaded : Arrays.asList(true, false)) { - new EngineErrorHandlingTestProvider(NullPointerException.class, multiThreaded); - new EngineErrorHandlingTestProvider(UserException.class, multiThreaded); - new EngineErrorHandlingTestProvider(ReviewedStingException.class, multiThreaded); + for ( final ErrorThrowing.FailMethod failMethod : ErrorThrowing.FailMethod.values() ) { + if ( failMethod == ErrorThrowing.FailMethod.TREE_REDUCE ) + continue; // cannot reliably throw errors in TREE_REDUCE + + final String failArg = " -fail " + failMethod.name(); + for ( final String args : Arrays.asList("", " -nt 2", " -nct 2") ) { + new EngineErrorHandlingTestProvider(NullPointerException.class, failArg + args); + new EngineErrorHandlingTestProvider(UserException.class, failArg + args); + new EngineErrorHandlingTestProvider(ReviewedStingException.class, failArg + args); + } } return EngineErrorHandlingTestProvider.getTests(EngineErrorHandlingTestProvider.class); @@ -109,11 +116,11 @@ public class EngineFeaturesIntegrationTest extends WalkerTest { // // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type // - @Test(dataProvider = "EngineErrorHandlingTestProvider") + @Test(enabled = true, dataProvider = "EngineErrorHandlingTestProvider", timeOut = 60 * 1000 ) public void testEngineErrorHandlingTestProvider(final EngineErrorHandlingTestProvider cfg) { for ( int i = 0; i < cfg.iterationsToTest; i++ ) { final String root = "-T ErrorThrowing -R " + exampleFASTA; - final String args = root + (cfg.multiThreaded ? 
" -nt 2" : "") + " -E " + cfg.expectedException.getSimpleName(); + final String args = root + cfg.args + " -E " + cfg.expectedException.getSimpleName(); WalkerTestSpec spec = new WalkerTestSpec(args, 0, cfg.expectedException); executeTest(cfg.toString(), spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java new file mode 100644 index 000000000..6cfd7bf46 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/MaxRuntimeIntegrationTest.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; +import java.util.Collections; +import java.util.concurrent.TimeUnit; + +/** + * + */ +public class MaxRuntimeIntegrationTest extends WalkerTest { + private static final long STARTUP_TIME = TimeUnit.NANOSECONDS.convert(20, TimeUnit.SECONDS); + + private class MaxRuntimeTestProvider extends TestDataProvider { + final long maxRuntime; + final TimeUnit unit; + + public MaxRuntimeTestProvider(final long maxRuntime, final TimeUnit unit) { + super(MaxRuntimeTestProvider.class); + this.maxRuntime = maxRuntime; + this.unit = unit; + setName(String.format("Max runtime test : %d of %s", maxRuntime, unit)); + } + + public long expectedMaxRuntimeNano() { + return TimeUnit.NANOSECONDS.convert(maxRuntime, unit) + STARTUP_TIME; + } + } + + @DataProvider(name = "MaxRuntimeProvider") + public Object[][] makeMaxRuntimeProvider() { + for ( final TimeUnit requestedUnits : Arrays.asList(TimeUnit.NANOSECONDS, TimeUnit.MILLISECONDS, TimeUnit.SECONDS, TimeUnit.MINUTES) ) + new MaxRuntimeTestProvider(requestedUnits.convert(30, TimeUnit.SECONDS), requestedUnits); + + return MaxRuntimeTestProvider.getTests(MaxRuntimeTestProvider.class); + } + + // + // Loop over errors to throw, make sure they are the errors we get back from the engine, regardless of NT type + // + @Test(enabled = true, dataProvider = "MaxRuntimeProvider", timeOut = 60 * 1000) + public void testMaxRuntime(final MaxRuntimeTestProvider cfg) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + hg18Reference + + " -I " + validationDataLocation + "NA12878.WEx.downsampled20x.bam -o /dev/null" + + " -maxRuntime " + cfg.maxRuntime + " -maxRuntimeUnits " + cfg.unit, 0, + Collections.emptyList()); + final 
SimpleTimer timer = new SimpleTimer().start(); + executeTest("Max runtime " + cfg, spec); + final long actualRuntimeNano = timer.getElapsedTimeNano(); + + Assert.assertTrue(actualRuntimeNano < cfg.expectedMaxRuntimeNano(), + "Actual runtime " + TimeUnit.SECONDS.convert(actualRuntimeNano, TimeUnit.NANOSECONDS) + + " exceeded max. tolerated runtime " + TimeUnit.SECONDS.convert(cfg.expectedMaxRuntimeNano(), TimeUnit.NANOSECONDS) + + " given requested runtime " + cfg.maxRuntime + " " + cfg.unit); + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java index 41bdda0e0..eaa098793 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReadBasedReferenceOrderedViewUnitTest.java @@ -1,207 +1,364 @@ /* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ +* Copyright (c) 2010. The Broad Institute +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +* OTHER DEALINGS IN THE SOFTWARE. 
+*/ package org.broadinstitute.sting.gatk.datasources.providers; +import net.sf.picard.util.PeekableIterator; import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import net.sf.samtools.SAMSequenceDictionary; -import org.testng.Assert; +import org.broad.tribble.BasicFeature; +import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTrackerUnitTest; +import org.broadinstitute.sting.commandline.RodBinding; +import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; - -import org.testng.annotations.BeforeMethod; - +import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.*; - /** - * @author aaron - *

    - * Class ReadBasedReferenceOrderedViewUnitTest - *

    - * test out the ReadBasedReferenceOrderedView class + * @author depristo */ public class ReadBasedReferenceOrderedViewUnitTest extends BaseTest { - private GenomeLocParser genomeLocParser; - private static int startingChr = 1; private static int endingChr = 2; private static int readCount = 100; private static int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; + private static String contig; private static SAMFileHeader header; + private GenomeLocParser genomeLocParser; + @BeforeClass public void beforeClass() { header = ArtificialSAMUtils.createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); + contig = header.getSequence(0).getSequenceName(); genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + + initializeTests(); } - @BeforeMethod - public void beforeEach() { - } - - @Test - public void testCreateReadMetaDataTrackerOnePerSite() { - // make ten reads, - List records = new ArrayList(); - for (int x = 1; x < 11; x++) { - SAMRecord rec = ArtificialSAMUtils.createArtificialRead(header, "name", 0, x, 10); + private class CompareFeatures implements Comparator { + @Override + public int compare(Feature o1, Feature o2) { + return genomeLocParser.createGenomeLoc(o1).compareTo(genomeLocParser.createGenomeLoc(o2)); } - GenomeLoc start = genomeLocParser.createGenomeLoc(header.getSequenceDictionary().getSequence(0).getSequenceName(), 0, 0); - List list = new ArrayList(); - list.add(new RMDDataState(null, new FakePeekingRODIterator(genomeLocParser,start, "fakeName"))); - ReadBasedReferenceOrderedView view = new ReadBasedReferenceOrderedView(new WindowedData(list)); + } - for (SAMRecord rec : records) { - ReadMetaDataTracker tracker = view.getReferenceOrderedDataForRead(rec); - Map> map = tracker.getReadOffsetMapping(); - for (Integer i : map.keySet()) { - Assert.assertEquals(map.get(i).size(), 1); + private class ReadMetaDataTrackerRODStreamTest extends TestDataProvider { + final List 
allFeatures; + final List intervals; + + public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final GenomeLoc interval) { + this(allFeatures, Collections.singletonList(interval)); + } + + public ReadMetaDataTrackerRODStreamTest(final List allFeatures, final List intervals) { + super(ReadMetaDataTrackerRODStreamTest.class); + this.allFeatures = new ArrayList(allFeatures); + Collections.sort(this.allFeatures, new CompareFeatures()); + this.intervals = new ArrayList(intervals); + Collections.sort(this.intervals); + setName(String.format("%s nFeatures %d intervals %s", getClass().getSimpleName(), allFeatures.size(), + intervals.size() == 1 ? intervals.get(0) : "size " + intervals.size())); + } + + public PeekableIterator getIterator(final String name) { + return new PeekableIterator(new TribbleIteratorFromCollection(name, genomeLocParser, allFeatures)); + } + + public Set getExpectedOverlaps(final GenomeLoc interval) { + final Set overlapping = new HashSet(); + for ( final Feature f : allFeatures ) + if ( genomeLocParser.createGenomeLoc(f).overlapsP(interval) ) + overlapping.add(f); + return overlapping; + } + } + + public void initializeTests() { + final List handPickedFeatures = new ArrayList(); + + handPickedFeatures.add(new BasicFeature(contig, 1, 1)); + handPickedFeatures.add(new BasicFeature(contig, 2, 5)); + handPickedFeatures.add(new BasicFeature(contig, 4, 4)); + handPickedFeatures.add(new BasicFeature(contig, 6, 6)); + handPickedFeatures.add(new BasicFeature(contig, 9, 10)); + handPickedFeatures.add(new BasicFeature(contig, 10, 10)); + handPickedFeatures.add(new BasicFeature(contig, 10, 11)); + handPickedFeatures.add(new BasicFeature(contig, 13, 20)); + + createTestsForFeatures(handPickedFeatures); + + // test in the present of a large spanning element + { + List oneLargeSpan = new ArrayList(handPickedFeatures); + oneLargeSpan.add(new BasicFeature(contig, 1, 30)); + createTestsForFeatures(oneLargeSpan); + } + + // test in the presence of a 
partially spanning element + { + List partialSpanStart = new ArrayList(handPickedFeatures); + partialSpanStart.add(new BasicFeature(contig, 1, 6)); + createTestsForFeatures(partialSpanStart); + } + + // test in the presence of a partially spanning element at the end + { + List partialSpanEnd = new ArrayList(handPickedFeatures); + partialSpanEnd.add(new BasicFeature(contig, 10, 30)); + createTestsForFeatures(partialSpanEnd); + } + + // no data at all + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, 5, 5); + new ReadMetaDataTrackerRODStreamTest(Collections.emptyList(), loc); + } + + // -------------------------------------------------------------------------------- + // + // tests for the lower level IntervalOverlappingRODsFromStream + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ReadMetaDataTrackerRODStreamTest") + public Object[][] createReadMetaDataTrackerRODStreamTest() { + return ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); + } + + private GenomeLoc span(final List features) { + int featuresStart = 1; for ( final GenomeLoc f : features ) featuresStart = Math.min(featuresStart, f.getStart()); + int featuresStop = 1; for ( final GenomeLoc f : features ) featuresStop = Math.max(featuresStop, f.getStop()); + return genomeLocParser.createGenomeLoc(contig, featuresStart, featuresStop); + } + + private void createTestsForFeatures(final List features) { + int featuresStart = 1; for ( final Feature f : features ) featuresStart = Math.min(featuresStart, f.getStart()); + int featuresStop = 1; for ( final Feature f : features ) featuresStop = Math.max(featuresStop, f.getEnd()); + + for ( final int size : Arrays.asList(1, 5, 10, 100) ) { + final List allIntervals = new ArrayList(); + // regularly spaced + for ( int start = featuresStart; start < featuresStop; start++) { + final GenomeLoc loc = genomeLocParser.createGenomeLoc(contig, start, start + size - 
1); + allIntervals.add(loc); + new ReadMetaDataTrackerRODStreamTest(features, loc); } - Assert.assertEquals(map.keySet().size(), 10); + + // starting and stopping at every feature + for ( final Feature f : features ) { + // just at the feature + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart(), f.getEnd())); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // up to end + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd())); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // missing by 1 + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() + 1, f.getEnd() + 1)); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + + // just spanning + allIntervals.add(genomeLocParser.createGenomeLoc(contig, f.getStart() - 1, f.getEnd() + 1)); + new ReadMetaDataTrackerRODStreamTest(features, allIntervals.get(allIntervals.size() - 1)); + } + + new ReadMetaDataTrackerRODStreamTest(features, allIntervals); + } + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest") + public void runReadMetaDataTrackerRODStreamTest_singleQuery(final ReadMetaDataTrackerRODStreamTest data) { + if ( data.intervals.size() == 1 ) { + final String name = "testName"; + final PeekableIterator iterator = data.getIterator(name); + final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); + testRODStream(data, stream, Collections.singletonList(data.intervals.get(0))); + } + } + + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerRODStreamTest", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_singleQuery") + public void runReadMetaDataTrackerRODStreamTest_multipleQueries(final ReadMetaDataTrackerRODStreamTest data) { + if ( data.intervals.size() > 1 ) { + final String name = "testName"; + final PeekableIterator iterator = 
data.getIterator(name); + final IntervalOverlappingRODsFromStream stream = new IntervalOverlappingRODsFromStream(name, iterator); + testRODStream(data, stream, data.intervals); + } + } + + private void testRODStream(final ReadMetaDataTrackerRODStreamTest test, final IntervalOverlappingRODsFromStream stream, final List intervals) { + for ( final GenomeLoc interval : intervals ) { + final RODRecordList query = stream.getOverlapping(interval); + final HashSet queryFeatures = new HashSet(); + for ( final GATKFeature f : query ) queryFeatures.add((Feature)f.getUnderlyingObject()); + final Set overlaps = test.getExpectedOverlaps(interval); + + Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." 
+ + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } + } + + // -------------------------------------------------------------------------------- + // + // tests for the higher level tracker itself + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "ReadMetaDataTrackerTests") + public Object[][] createTrackerTests() { + List tests = new ArrayList(); + + final Object[][] singleTests = ReadMetaDataTrackerRODStreamTest.getTests(ReadMetaDataTrackerRODStreamTest.class); + final List multiSiteTests = new ArrayList(); + for ( final Object[] singleTest : singleTests ) { + if ( ((ReadMetaDataTrackerRODStreamTest)singleTest[0]).intervals.size() > 1 ) + multiSiteTests.add((ReadMetaDataTrackerRODStreamTest)singleTest[0]); } + for ( final boolean testStateless : Arrays.asList(true, false) ) { + // all pairwise tests + for ( List singleTest : Utils.makePermutations(multiSiteTests, 2, false)) { + tests.add(new Object[]{singleTest, testStateless}); + } + + // all 3 way pairwise tests + //for ( List singleTest : Utils.makePermutations(multiSiteTests, 3, false)) { + // tests.add(new Object[]{singleTest, testStateless}); + //} + } + + logger.warn("Creating " + tests.size() + " tests for ReadMetaDataTrackerTests"); + return tests.toArray(new Object[][]{}); } -} + @Test(enabled = true, dataProvider = "ReadMetaDataTrackerTests", dependsOnMethods = "runReadMetaDataTrackerRODStreamTest_multipleQueries") + public void runReadMetaDataTrackerTest(final List RODs, final boolean testStateless) { + final List names = new ArrayList(); + final List> iterators = new ArrayList>(); + final List intervals = new ArrayList(); + final List> rodBindings = new ArrayList>(); + for ( int i = 0; i < RODs.size(); i++ ) { + final RodBinding rodBinding = new RodBinding(Feature.class, "name"+i); + rodBindings.add(rodBinding); + final String name = rodBinding.getName(); + names.add(name); + 
iterators.add(RODs.get(i).getIterator(name)); + intervals.addAll(RODs.get(i).intervals); + } -class FakePeekingRODIterator implements LocationAwareSeekableRODIterator { - private GenomeLocParser genomeLocParser; + Collections.sort(intervals); + final GenomeLoc span = span(intervals); + final ReadBasedReferenceOrderedView view = new ReadBasedReferenceOrderedView(genomeLocParser, span, names, iterators); - // current location - private GenomeLoc location; - private GATKFeature curROD; - private final String name; + if ( testStateless ) { + // test each tracker is well formed, as each is created + for ( final GenomeLoc interval : intervals ) { + final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + testMetaDataTrackerBindings(tracker, interval, RODs, rodBindings); + } + } else { + // tests all trackers are correct after reading them into an array + // this checks that the trackers are be safely stored away and analyzed later (critical for nano-scheduling) + final List trackers = new ArrayList(); + for ( final GenomeLoc interval : intervals ) { + final RefMetaDataTracker tracker = view.getReferenceOrderedDataForInterval(interval); + trackers.add(tracker); + } - public FakePeekingRODIterator(GenomeLocParser genomeLocParser, GenomeLoc startingLoc, String name) { - this.name = name; - this.location = genomeLocParser.createGenomeLoc(startingLoc.getContig(), startingLoc.getStart() + 1, startingLoc.getStop() + 1); + for ( int i = 0; i < trackers.size(); i++) { + testMetaDataTrackerBindings(trackers.get(i), intervals.get(i), RODs, rodBindings); + } + } } - /** - * Gets the header associated with the backing input stream. - * @return the ROD header. 
- */ - @Override - public Object getHeader() { - return null; + private void testMetaDataTrackerBindings(final RefMetaDataTracker tracker, + final GenomeLoc interval, + final List RODs, + final List> rodBindings) { + for ( int i = 0; i < RODs.size(); i++ ) { + final ReadMetaDataTrackerRODStreamTest test = RODs.get(i); + final List queryFeaturesList = tracker.getValues(rodBindings.get(i)); + final Set queryFeatures = new HashSet(queryFeaturesList); + final Set overlaps = test.getExpectedOverlaps(interval); + + Assert.assertEquals(queryFeatures.size(), overlaps.size(), "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected size = " + overlaps.size() + " but saw " + queryFeatures.size()); + + BaseTest.assertEqualsSet(queryFeatures, overlaps, "IntervalOverlappingRODsFromStream didn't return the expected set of overlapping features." + + " Expected = " + Utils.join(",", overlaps) + " but saw " + Utils.join(",", queryFeatures)); + } } - /** - * Gets the sequence dictionary associated with the backing input stream. - * @return sequence dictionary from the ROD header. - */ - @Override - public SAMSequenceDictionary getSequenceDictionary() { - return null; - } + static class TribbleIteratorFromCollection implements Iterator { + // current location + private final String name; + final Queue gatkFeatures; + public TribbleIteratorFromCollection(final String name, final GenomeLocParser genomeLocParser, final List features) { + this.name = name; - @Override - public GenomeLoc peekNextLocation() { - System.err.println("Peek Next -> " + location); - return location; - } + this.gatkFeatures = new LinkedList(); + for ( final Feature f : features ) + gatkFeatures.add(new GATKFeature.TribbleGATKFeature(genomeLocParser, f, name)); + } - @Override - public GenomeLoc position() { - return location; - } + @Override + public boolean hasNext() { + return ! 
gatkFeatures.isEmpty(); + } - @Override - public RODRecordList seekForward(GenomeLoc interval) { - while (location.isBefore(interval)) - next(); - return next(); // we always move by one, we know the next location will be right - } + @Override + public RODRecordList next() { + final GATKFeature first = gatkFeatures.poll(); + final Collection myFeatures = new LinkedList(); + myFeatures.add(first); + while ( gatkFeatures.peek() != null && gatkFeatures.peek().getLocation().getStart() == first.getStart() ) + myFeatures.add(gatkFeatures.poll()); - @Override - public boolean hasNext() { - return true; // we always have next - } + GenomeLoc loc = first.getLocation(); + for ( final GATKFeature feature : myFeatures ) + loc = loc.merge(feature.getLocation()); - @Override - public RODRecordList next() { - System.err.println("Next -> " + location); - curROD = new ReadMetaDataTrackerUnitTest.FakeRODatum(location, name); - location = genomeLocParser.createGenomeLoc(location.getContig(), location.getStart() + 1, location.getStop() + 1); - FakeRODRecordList list = new FakeRODRecordList(); - list.add(curROD); - return list; - } + return new RODRecordListImpl(name, myFeatures, loc); // is this safe? 
+ } - @Override - public void remove() { - throw new IllegalStateException("GRRR"); - } - - @Override - public void close() { - // nothing to do + @Override public void remove() { throw new IllegalStateException("GRRR"); } } } -class FakeRODRecordList extends AbstractList implements RODRecordList { - private final List list = new ArrayList(); - public boolean add(GATKFeature data) { - return list.add(data); - } - - @Override - public GATKFeature get(int i) { - return list.get(i); - } - - @Override - public int size() { - return list.size(); - } - - @Override - public GenomeLoc getLocation() { - return list.get(0).getLocation(); - } - - @Override - public String getName() { - return "test"; - } - - @Override - public int compareTo(RODRecordList rodRecordList) { - return this.list.get(0).getLocation().compareTo(rodRecordList.getLocation()); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java index d75beae23..11a7b4cf7 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/providers/ReferenceOrderedViewUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.datasources.providers; import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; @@ -89,7 +90,7 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null); - TableFeature datum = 
tracker.getFirstValue(TableFeature.class, "tableTest"); + TableFeature datum = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest")); Assert.assertEquals(datum.get("COL1"),"C","datum parameter for COL1 is incorrect"); Assert.assertEquals(datum.get("COL2"),"D","datum parameter for COL2 is incorrect"); @@ -115,13 +116,13 @@ public class ReferenceOrderedViewUnitTest extends BaseTest { ReferenceOrderedView view = new ManagingReferenceOrderedView( provider ); RefMetaDataTracker tracker = view.getReferenceOrderedDataAtLocus(genomeLocParser.createGenomeLoc("chrM",20), null); - TableFeature datum1 = tracker.getFirstValue(TableFeature.class, "tableTest1"); + TableFeature datum1 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest1")); Assert.assertEquals(datum1.get("COL1"),"C","datum1 parameter for COL1 is incorrect"); Assert.assertEquals(datum1.get("COL2"),"D","datum1 parameter for COL2 is incorrect"); Assert.assertEquals(datum1.get("COL3"),"E","datum1 parameter for COL3 is incorrect"); - TableFeature datum2 = tracker.getFirstValue(TableFeature.class, "tableTest2"); + TableFeature datum2 = tracker.getFirstValue(new RodBinding(TableFeature.class, "tableTest2")); Assert.assertEquals(datum2.get("COL1"),"C","datum2 parameter for COL1 is incorrect"); Assert.assertEquals(datum2.get("COL2"),"D","datum2 parameter for COL2 is incorrect"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java index 477b76e37..61c1c51b4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/DownsamplerBenchmark.java @@ -26,18 +26,20 @@ package org.broadinstitute.sting.gatk.datasources.reads; import com.google.caliper.Param; import net.sf.picard.filter.FilteringIterator; +import net.sf.samtools.SAMFileHeader; 
import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.DownsamplingMethod; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; import org.broadinstitute.sting.gatk.iterators.LocusIteratorByState; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; +import org.broadinstitute.sting.gatk.walkers.qc.CountLoci; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.baq.BAQ; import java.util.Collections; import java.util.Iterator; @@ -69,18 +71,16 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { for(int i = 0; i < reps; i++) { SAMFileReader reader = new SAMFileReader(inputFile); ReadProperties readProperties = new ReadProperties(Collections.singletonList(new SAMReaderID(inputFile,new Tags())), - reader.getFileHeader(), - false, - SAMFileReader.ValidationStringency.SILENT, - downsampling.create(), - new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), - Collections.emptyList(), - false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR - (byte)0); + reader.getFileHeader(), + SAMFileHeader.SortOrder.coordinate, + false, + SAMFileReader.ValidationStringency.SILENT, + downsampling.create(), + new ValidationExclusion(Collections.singletonList(ValidationExclusion.TYPE.ALL)), + Collections.emptyList(), + Collections.emptyList(), + false, + (byte)0); GenomeLocParser genomeLocParser = new GenomeLocParser(reader.getFileHeader().getSequenceDictionary()); // Filter unmapped reads. 
TODO: is this always strictly necessary? Who in the GATK normally filters these out? @@ -100,7 +100,7 @@ public class DownsamplerBenchmark extends ReadProcessingBenchmark { }, PER_SAMPLE { @Override - DownsamplingMethod create() { return GATKArgumentCollection.getDefaultDownsamplingMethod(); } + DownsamplingMethod create() { return DownsamplingMethod.getDefaultDownsamplingMethod(new CountLoci(), false); } }; abstract DownsamplingMethod create(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java new file mode 100644 index 000000000..0807f36dc --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/ExperimentalReadShardBalancerUnitTest.java @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.*; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; + +public class ExperimentalReadShardBalancerUnitTest extends BaseTest { + + /** + * Tests to ensure that ExperimentalReadShardBalancer works as expected and does not place shard boundaries + * at inappropriate places, such as within an alignment start position + */ + private static class ExperimentalReadShardBalancerTest extends TestDataProvider { + private int numContigs; + private int numStacksPerContig; + private int stackSize; + private int numUnmappedReads; + private DownsamplingMethod downsamplingMethod; + private int expectedReadCount; + + private SAMFileHeader header; + private SAMReaderID testBAM; + + public ExperimentalReadShardBalancerTest( int numContigs, + int numStacksPerContig, + int 
stackSize, + int numUnmappedReads, + int downsamplingTargetCoverage ) { + super(ExperimentalReadShardBalancerTest.class); + + this.numContigs = numContigs; + this.numStacksPerContig = numStacksPerContig; + this.stackSize = stackSize; + this.numUnmappedReads = numUnmappedReads; + + this.downsamplingMethod = new DownsamplingMethod(DownsampleType.BY_SAMPLE, downsamplingTargetCoverage, null, true); + this.expectedReadCount = Math.min(stackSize, downsamplingTargetCoverage) * numStacksPerContig * numContigs + numUnmappedReads; + + setName(String.format("%s: numContigs=%d numStacksPerContig=%d stackSize=%d numUnmappedReads=%d downsamplingTargetCoverage=%d", + getClass().getSimpleName(), numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage)); + } + + public void run() { + createTestBAM(); + + SAMDataSource dataSource = new SAMDataSource(Arrays.asList(testBAM), + new ThreadAllocation(), + null, + new GenomeLocParser(header.getSequenceDictionary()), + false, + SAMFileReader.ValidationStringency.SILENT, + ReadShard.DEFAULT_MAX_READS, // reset ReadShard.MAX_READS to ReadShard.DEFAULT_MAX_READS for each test + downsamplingMethod, + new ValidationExclusion(), + new ArrayList(), + false); + + Iterable shardIterator = dataSource.createShardIteratorOverAllReads(new ExperimentalReadShardBalancer()); + + SAMRecord readAtEndOfLastShard = null; + int totalReadsSeen = 0; + + for ( Shard shard : shardIterator ) { + int numContigsThisShard = 0; + SAMRecord lastRead = null; + + for ( SAMRecord read : shard.iterator() ) { + totalReadsSeen++; + + if ( lastRead == null ) { + numContigsThisShard = 1; + } + else if ( ! read.getReadUnmappedFlag() && ! lastRead.getReferenceIndex().equals(read.getReferenceIndex()) ) { + numContigsThisShard++; + } + + // If the last read from the previous shard is not unmapped, we have to make sure + // that no reads in this shard start at the same position + if ( readAtEndOfLastShard != null && ! 
readAtEndOfLastShard.getReadUnmappedFlag() ) { + Assert.assertFalse(readAtEndOfLastShard.getReferenceIndex().equals(read.getReferenceIndex()) && + readAtEndOfLastShard.getAlignmentStart() == read.getAlignmentStart(), + String.format("Reads from alignment start position %d:%d are split across multiple shards", + read.getReferenceIndex(), read.getAlignmentStart())); + } + + lastRead = read; + } + + // There should never be reads from more than 1 contig in a shard (ignoring unmapped reads) + Assert.assertTrue(numContigsThisShard == 1, "found a shard with reads from multiple contigs"); + + readAtEndOfLastShard = lastRead; + } + + Assert.assertEquals(totalReadsSeen, expectedReadCount, "did not encounter the expected number of reads"); + } + + private void createTestBAM() { + header = ArtificialSAMUtils.createArtificialSamHeader(numContigs, 1, 100000); + SAMReadGroupRecord readGroup = new SAMReadGroupRecord("foo"); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + ArtificialSingleSampleReadStream artificialReads = new ArtificialSingleSampleReadStream(header, + "foo", + numContigs, + numStacksPerContig, + stackSize, + stackSize, + 1, + 100, + 50, + 150, + numUnmappedReads); + + File testBAMFile; + try { + testBAMFile = File.createTempFile("SAMDataSourceFillShardBoundaryTest", ".bam"); + testBAMFile.deleteOnExit(); + } + catch ( IOException e ) { + throw new ReviewedStingException(String.format("Failed to create temp bam file for test %s. 
%s", this, e.getMessage())); + } + + SAMFileWriter bamWriter = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(header, true, testBAMFile); + for ( SAMRecord read : artificialReads ) { + bamWriter.addAlignment(read); + } + bamWriter.close(); + + testBAM = new SAMReaderID(testBAMFile, new Tags()); + + new File(testBAM.getSamFilePath().replace(".bam", ".bai")).deleteOnExit(); + new File(testBAM.getSamFilePath() + ".bai").deleteOnExit(); + } + } + + @DataProvider(name = "ExperimentalReadShardBalancerTestDataProvider") + public Object[][] createExperimentalReadShardBalancerTests() { + for ( int numContigs = 1; numContigs <= 3; numContigs++ ) { + for ( int numStacksPerContig : Arrays.asList(1, 2, 4) ) { + // Use crucial read shard boundary values as the stack sizes + for ( int stackSize : Arrays.asList(ReadShard.DEFAULT_MAX_READS / 2, ReadShard.DEFAULT_MAX_READS / 2 + 10, ReadShard.DEFAULT_MAX_READS, ReadShard.DEFAULT_MAX_READS - 1, ReadShard.DEFAULT_MAX_READS + 1, ReadShard.DEFAULT_MAX_READS * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, ReadShard.DEFAULT_MAX_READS / 2, ReadShard.DEFAULT_MAX_READS * 2) ) { + // The first value will result in no downsampling at all, the others in some downsampling + for ( int downsamplingTargetCoverage : Arrays.asList(ReadShard.DEFAULT_MAX_READS * 10, ReadShard.DEFAULT_MAX_READS, ReadShard.DEFAULT_MAX_READS / 2) ) { + new ExperimentalReadShardBalancerTest(numContigs, numStacksPerContig, stackSize, numUnmappedReads, downsamplingTargetCoverage); + } + } + } + } + } + + return ExperimentalReadShardBalancerTest.getTests(ExperimentalReadShardBalancerTest.class); + } + + @Test(dataProvider = "ExperimentalReadShardBalancerTestDataProvider") + public void runExperimentalReadShardBalancerTest( ExperimentalReadShardBalancerTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java 
b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java index 66585c872..1dd4854cd 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/GATKWalkerBenchmark.java @@ -31,7 +31,7 @@ import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter; -import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -123,7 +123,7 @@ class CountBasesInReadPerformanceWalker extends ReadWalker { private long Gs; private long Ts; - public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker tracker) { + public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker tracker) { for(byte base: read.getReadBases()) { switch(base) { case 'A': As++; break; diff --git a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java index f2c546317..0ed485cd2 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSourceUnitTest.java @@ -24,39 +24,32 @@ package org.broadinstitute.sting.gatk.datasources.reads; -import static org.testng.Assert.assertEquals; -import static org.testng.Assert.assertTrue; -import static org.testng.Assert.fail; import net.sf.picard.reference.IndexedFastaSequenceFile; -import 
net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMProgramRecord; -import net.sf.samtools.SAMRecord; +import net.sf.samtools.*; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.baq.BAQ; -import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.annotations.AfterMethod; import org.testng.annotations.BeforeMethod; - import org.testng.annotations.Test; import java.io.File; import java.io.FileNotFoundException; import java.util.ArrayList; -import java.util.Iterator; +import java.util.Collections; import java.util.List; +import static org.testng.Assert.*; + /** - * @author aaron - * @version 1.0 - * @date Apr 8, 2009 *

    * Class SAMDataSourceUnitTest *

    @@ -64,6 +57,8 @@ import java.util.List; */ public class SAMDataSourceUnitTest extends BaseTest { + // TODO: These legacy tests should really be replaced with a more comprehensive suite of tests for SAMDataSource + private List readers; private IndexedFastaSequenceFile seq; private GenomeLocParser genomeLocParser; @@ -183,11 +178,8 @@ public class SAMDataSourceUnitTest extends BaseTest { null, new ValidationExclusion(), new ArrayList(), + Collections.emptyList(), false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR (byte) -1, removeProgramRecords); @@ -205,11 +197,8 @@ public class SAMDataSourceUnitTest extends BaseTest { null, new ValidationExclusion(), new ArrayList(), + Collections.emptyList(), false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR (byte) -1, removeProgramRecords); diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java index b0de78b97..b0a8ff065 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/DownsamplingReadsIteratorUnitTest.java @@ -1,73 +1,138 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or 
substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.testng.Assert; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; -import java.util.Collection; +import java.util.Arrays; -public class DownsamplingReadsIteratorUnitTest { +public class DownsamplingReadsIteratorUnitTest extends BaseTest { - @Test - public void testDownsamplingIteratorWithPositionalDownsampling() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + private static class DownsamplingReadsIteratorTest extends TestDataProvider { + private DownsamplingReadsIterator downsamplingIter; + private int targetCoverage; + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; - Collection reads = new ArrayList(); + public 
DownsamplingReadsIteratorTest( ArtificialSingleSampleReadStream stream, int targetCoverage ) { + super(DownsamplingReadsIteratorTest.class); - reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 1, 100)); - reads.addAll(createStackOfIdenticalReads(3000, header, "foo", 0, 50, 100)); + this.stream = stream; + this.targetCoverage = targetCoverage; - StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); - - Assert.assertTrue(iter.hasNext()); - SAMRecord previous = iter.next(); - int count = 1; - - while ( iter.hasNext() ) { - SAMRecord current = iter.next(); - Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); - count++; - previous = current; + setName(String.format("%s: targetCoverage=%d numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + getClass().getSimpleName(), + targetCoverage, + stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); } - Assert.assertEquals(count, 1000); + public void run() { + streamAnalyzer = new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(stream, targetCoverage); + downsamplingIter = new DownsamplingReadsIterator(stream.getStingSAMIterator(), new SimplePositionalDownsampler(targetCoverage)); + + streamAnalyzer.analyze(downsamplingIter); + + // Check whether the observed properties of the downsampled stream are what they should be + streamAnalyzer.validate(); + + // Allow memory used by this test to be reclaimed + stream = null; + streamAnalyzer = null; + downsamplingIter = null; + } } - @Test - public void 
testDownsamplingIteratorNoEffectiveDownsampling() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + @DataProvider(name = "DownsamplingReadsIteratorTestDataProvider") + public Object[][] createDownsamplingReadsIteratorTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(5, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); - Collection reads = new ArrayList(); + // Values that don't vary across tests + int targetCoverage = 10; + int minReadLength = 50; + int maxReadLength = 100; + int minDistanceBetweenStacks = 1; + int maxDistanceBetweenStacks = maxReadLength + 1; - reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); - reads.addAll(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); + GenomeAnalysisEngine.resetRandomGenerator(); - StingSAMIterator iter = new DownsamplingReadsIterator(StingSAMIteratorAdapter.adapt(reads.iterator()), new PositionalDownsampler(1000)); - - Assert.assertTrue(iter.hasNext()); - SAMRecord previous = iter.next(); - int count = 1; - - while ( iter.hasNext() ) { - SAMRecord current = iter.next(); - Assert.assertTrue(previous.getAlignmentStart() <= current.getAlignmentStart() || ! previous.getReferenceIndex().equals(current.getReferenceIndex())); - count++; - previous = current; + // brute force testing! 
+ for ( int numContigs : Arrays.asList(1, 2, 5) ) { + for ( int stacksPerContig : Arrays.asList(1, 2, 10) ) { + for ( int minReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int maxReadsPerStack : Arrays.asList(1, targetCoverage / 2, targetCoverage, targetCoverage - 1, targetCoverage + 1, targetCoverage * 2) ) { + for ( int numUnmappedReads : Arrays.asList(0, 1, targetCoverage, targetCoverage * 2) ) { + // Only interested in sane read stream configurations here + if ( minReadsPerStack <= maxReadsPerStack ) { + new DownsamplingReadsIteratorTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads), + targetCoverage); + } + } + } + } + } } - Assert.assertEquals(count, 600); + return DownsamplingReadsIteratorTest.getTests(DownsamplingReadsIteratorTest.class); } - private ArrayList createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { - ArrayList stack = new ArrayList(stackSize); - for ( int i = 1; i <= stackSize; i++ ) { - stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); - } - return stack; + @Test(dataProvider = "DownsamplingReadsIteratorTestDataProvider") + public void runDownsamplingReadsIteratorTest( DownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java index 0f4bae555..3bf1096b1 100644 --- 
a/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/FractionalDownsamplerUnitTest.java @@ -1,65 +1,157 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + package org.broadinstitute.sting.gatk.downsampling; import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import org.testng.Assert; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; import java.util.List; -public class FractionalDownsamplerUnitTest { +public class FractionalDownsamplerUnitTest extends BaseTest { - @Test - public void test100PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(1.0); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + private static class FractionalDownsamplerTest extends TestDataProvider { + double fraction; + int totalReads; + int expectedMinNumReadsAfterDownsampling; + int expectedMaxNumReadsAfterDownsampling; + int expectedMinDiscardedItems; + int expectedMaxDiscardedItems; - downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); + private static final double EXPECTED_ACCURACY = 0.05; // should be accurate to within +/- this percent - List downsampledReads = downsampler.consumeDownsampledItems(); + public FractionalDownsamplerTest( double fraction, int totalReads ) { + super(FractionalDownsamplerTest.class); - Assert.assertTrue(downsampledReads.size() == 1000); - } + this.fraction = fraction; + this.totalReads = totalReads; - @Test - public void test0PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(0.0); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + calculateExpectations(); - downsampler.submit(createRandomReads(1000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); - - List downsampledReads = 
downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.isEmpty()); - } - - @Test - public void test50PercentInclusion() { - FractionalDownsampler downsampler = new FractionalDownsampler(0.5); - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - downsampler.submit(createRandomReads(5000, header, "foo", 0, 100000, 500)); - downsampler.signalEndOfInput(); - - List downsampledReads = downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.size() >= 2000 && downsampledReads.size() <= 3000); - } - - private List createRandomReads( int numReads, SAMFileHeader header, String name, int contigIndex, int maxAlignmentStart, int maxLength ) { - List reads = new ArrayList(numReads); - - for ( int i = 1; i <= numReads; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, name, contigIndex, - GenomeAnalysisEngine.getRandomGenerator().nextInt(maxAlignmentStart) + 1, - GenomeAnalysisEngine.getRandomGenerator().nextInt(maxLength) + 1)); + setName(String.format("%s: fraction=%.2f totalReads=%d expectedMinNumReadsAfterDownsampling=%d expectedMaxNumReadsAfterDownsampling=%d", + getClass().getSimpleName(), fraction, totalReads, expectedMinNumReadsAfterDownsampling, expectedMaxNumReadsAfterDownsampling)); } - return reads; + private void calculateExpectations() { + // Require an exact match in the 0% and 100% cases + if ( fraction == 0.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = 0; + expectedMinDiscardedItems = expectedMaxDiscardedItems = totalReads; + } + else if ( fraction == 1.0 ) { + expectedMinNumReadsAfterDownsampling = expectedMaxNumReadsAfterDownsampling = totalReads; + expectedMinDiscardedItems = expectedMaxDiscardedItems = 0; + } + else { + expectedMinNumReadsAfterDownsampling = Math.max((int)((fraction - EXPECTED_ACCURACY) * totalReads), 0); + expectedMaxNumReadsAfterDownsampling = Math.min((int) ((fraction + EXPECTED_ACCURACY) * 
totalReads), totalReads); + expectedMinDiscardedItems = totalReads - expectedMaxNumReadsAfterDownsampling; + expectedMaxDiscardedItems = totalReads - expectedMinNumReadsAfterDownsampling; + } + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "FractionalDownsamplerTestDataProvider") + public Object[][] createFractionalDownsamplerTestData() { + for ( double fraction : Arrays.asList(0.0, 0.25, 0.5, 0.75, 1.0) ) { + for ( int totalReads : Arrays.asList(0, 1000, 10000) ) { + new FractionalDownsamplerTest(fraction, totalReads); + } + } + + return FractionalDownsamplerTest.getTests(FractionalDownsamplerTest.class); + } + + @Test(dataProvider = "FractionalDownsamplerTestDataProvider") + public void runFractionalDownsamplerTest( FractionalDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new FractionalDownsampler(test.fraction); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + if ( test.fraction > FractionalDownsamplerTest.EXPECTED_ACCURACY ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + 
Assert.assertTrue(downsampler.peekFinalized() != null); + } + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertTrue(downsampledReads.size() >= test.expectedMinNumReadsAfterDownsampling && + downsampledReads.size() <= test.expectedMaxNumReadsAfterDownsampling); + + Assert.assertTrue(downsampler.getNumberOfDiscardedItems() >= test.expectedMinDiscardedItems && + downsampler.getNumberOfDiscardedItems() <= test.expectedMaxDiscardedItems); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.totalReads - downsampledReads.size()); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java new file mode 100644 index 000000000..2717d014c --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/LevelingDownsamplerUnitTest.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software 
is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.LinkedList; +import java.util.List; + +public class LevelingDownsamplerUnitTest extends BaseTest { + + private static class LevelingDownsamplerUniformStacksTest extends TestDataProvider { + public enum DataStructure { LINKED_LIST, ARRAY_LIST } + + int targetSize; + int numStacks; + int stackSize; + DataStructure dataStructure; + int expectedSize; + + public LevelingDownsamplerUniformStacksTest( int targetSize, int numStacks, int stackSize, DataStructure dataStructure ) { + super(LevelingDownsamplerUniformStacksTest.class); + + this.targetSize = targetSize; + this.numStacks = numStacks; + this.stackSize = stackSize; + this.dataStructure = dataStructure; + expectedSize = calculateExpectedDownsampledStackSize(); + + setName(String.format("%s: targetSize=%d numStacks=%d stackSize=%d dataStructure=%s expectedSize=%d", + getClass().getSimpleName(), targetSize, numStacks, stackSize, dataStructure, expectedSize)); + } + + public Collection> 
createStacks() { + Collection> stacks = new ArrayList>(); + + for ( int i = 1; i <= numStacks; i++ ) { + List stack = dataStructure == DataStructure.LINKED_LIST ? new LinkedList() : new ArrayList(); + + for ( int j = 1; j <= stackSize; j++ ) { + stack.add(new Object()); + } + + stacks.add(stack); + } + + return stacks; + } + + private int calculateExpectedDownsampledStackSize() { + int numItemsToRemove = numStacks * stackSize - targetSize; + + if ( numStacks == 0 ) { + return 0; + } + else if ( numItemsToRemove <= 0 ) { + return stackSize; + } + + return Math.max(1, stackSize - (numItemsToRemove / numStacks)); + } + } + + @DataProvider(name = "UniformStacksDataProvider") + public Object[][] createUniformStacksTestData() { + for ( int targetSize = 1; targetSize <= 10000; targetSize *= 10 ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + for ( int stackSize = 1; stackSize <= 1000; stackSize *= 10 ) { + for ( LevelingDownsamplerUniformStacksTest.DataStructure dataStructure : LevelingDownsamplerUniformStacksTest.DataStructure.values() ) { + new LevelingDownsamplerUniformStacksTest(targetSize, numStacks, stackSize, dataStructure); + } + } + } + } + + return LevelingDownsamplerUniformStacksTest.getTests(LevelingDownsamplerUniformStacksTest.class); + } + + @Test( dataProvider = "UniformStacksDataProvider" ) + public void testLevelingDownsamplerWithUniformStacks( LevelingDownsamplerUniformStacksTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + Downsampler> downsampler = new LevelingDownsampler, Object>(test.targetSize); + + downsampler.submit(test.createStacks()); + + if ( test.numStacks > 0 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || 
downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List> downsampledStacks = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledStacks.size(), test.numStacks); + + int totalRemainingItems = 0; + for ( List stack : downsampledStacks ) { + Assert.assertTrue(Math.abs(stack.size() - test.expectedSize) <= 1); + totalRemainingItems += stack.size(); + } + + int numItemsReportedDiscarded = downsampler.getNumberOfDiscardedItems(); + int numItemsActuallyDiscarded = test.numStacks * test.stackSize - totalRemainingItems; + + Assert.assertEquals(numItemsReportedDiscarded, numItemsActuallyDiscarded); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + + Assert.assertTrue(totalRemainingItems <= Math.max(test.targetSize, test.numStacks)); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java new file mode 100644 index 000000000..b9022900b --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PerSampleDownsamplingReadsIteratorUnitTest.java @@ -0,0 +1,298 @@ +/* + * 
Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.iterators.VerifyingSamIterator; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialMultiSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +public class PerSampleDownsamplingReadsIteratorUnitTest extends BaseTest { + + private static class PerSampleDownsamplingReadsIteratorTest extends TestDataProvider { + + // TODO: tests should distinguish between variance across samples and variance within a sample + + private enum StreamDensity { + SPARSE (MAX_READ_LENGTH, MAX_READ_LENGTH * 2), + DENSE (1, MIN_READ_LENGTH), + MIXED (1, MAX_READ_LENGTH * 2), + UNIFORM_DENSE (1, 1), + UNIFORM_SPARSE (MAX_READ_LENGTH * 2, MAX_READ_LENGTH * 2); + + int minDistanceBetweenStacks; + int maxDistanceBetweenStacks; + + StreamDensity( int minDistanceBetweenStacks, int maxDistanceBetweenStacks ) { + this.minDistanceBetweenStacks = minDistanceBetweenStacks; + this.maxDistanceBetweenStacks = maxDistanceBetweenStacks; + } + + public String toString() { + return String.format("StreamDensity:%d-%d", minDistanceBetweenStacks, maxDistanceBetweenStacks); + } + } + + private enum StreamStackDepth { + NON_UNIFORM_LOW (1, 5), + NON_UNIFORM_HIGH (15, 20), + NON_UNIFORM_MIXED (1, 20), + UNIFORM_SINGLE (1, 
1), + UNIFORM_LOW (2, 2), + UNIFORM_HIGH (20, 20), + UNIFORM_MEDIUM (10, 10); // should set target coverage to this value for testing + + int minReadsPerStack; + int maxReadsPerStack; + + StreamStackDepth( int minReadsPerStack, int maxReadsPerStack ) { + this.minReadsPerStack = minReadsPerStack; + this.maxReadsPerStack = maxReadsPerStack; + } + + public boolean isUniform() { + return minReadsPerStack == maxReadsPerStack; + } + + public String toString() { + return String.format("StreamStackDepth:%d-%d", minReadsPerStack, maxReadsPerStack); + } + } + + private enum StreamStacksPerContig { + UNIFORM(20, 20), + NON_UNIFORM(1, 30); + + int minStacksPerContig; + int maxStacksPerContig; + + StreamStacksPerContig( int minStacksPerContig, int maxStacksPerContig ) { + this.minStacksPerContig = minStacksPerContig; + this.maxStacksPerContig = maxStacksPerContig; + } + + public boolean isUniform() { + return minStacksPerContig == maxStacksPerContig; + } + + public String toString() { + return String.format("StreamStacksPerContig:%d-%d", minStacksPerContig, maxStacksPerContig); + } + } + + // Not interested in testing multiple ranges for the read lengths, as none of our current + // downsamplers are affected by read length + private static final int MIN_READ_LENGTH = 50; + private static final int MAX_READ_LENGTH = 150; + + private ReadsDownsamplerFactory downsamplerFactory; + private int targetCoverage; + private int numSamples; + private int minContigs; + private int maxContigs; + private StreamDensity streamDensity; + private StreamStackDepth streamStackDepth; + private StreamStacksPerContig streamStacksPerContig; + private double unmappedReadsFraction; + private int unmappedReadsCount; + private boolean verifySortedness; + + private ArtificialMultiSampleReadStream mergedReadStream; + private Map perSampleArtificialReadStreams; + private Map perSampleStreamAnalyzers; + private SAMFileHeader header; + + public PerSampleDownsamplingReadsIteratorTest( ReadsDownsamplerFactory 
downsamplerFactory, + int targetCoverage, + int numSamples, + int minContigs, + int maxContigs, + StreamDensity streamDensity, + StreamStackDepth streamStackDepth, + StreamStacksPerContig streamStacksPerContig, + double unmappedReadsFraction, + int unmappedReadsCount, + boolean verifySortedness ) { + super(PerSampleDownsamplingReadsIteratorTest.class); + + this.downsamplerFactory = downsamplerFactory; + this.targetCoverage = targetCoverage; + this.numSamples = numSamples; + this.minContigs = minContigs; + this.maxContigs = maxContigs; + this.streamDensity = streamDensity; + this.streamStackDepth = streamStackDepth; + this.streamStacksPerContig = streamStacksPerContig; + this.unmappedReadsFraction = unmappedReadsFraction; + this.unmappedReadsCount = unmappedReadsCount; + this.verifySortedness = verifySortedness; + + header = createHeader(); + createReadStreams(); + + setName(String.format("%s: targetCoverage=%d numSamples=%d minContigs=%d maxContigs=%d %s %s %s unmappedReadsFraction=%.2f unmappedReadsCount=%d verifySortedness=%b", + getClass().getSimpleName(), targetCoverage, numSamples, minContigs, maxContigs, streamDensity, streamStackDepth, streamStacksPerContig, unmappedReadsFraction, unmappedReadsCount, verifySortedness)); + } + + private SAMFileHeader createHeader() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(maxContigs, 1, (streamDensity.maxDistanceBetweenStacks + MAX_READ_LENGTH) * streamStacksPerContig.maxStacksPerContig + 100000); + List readGroups = new ArrayList(numSamples); + List sampleNames = new ArrayList(numSamples); + + for ( int i = 0; i < numSamples; i++ ) { + readGroups.add("ReadGroup" + i); + sampleNames.add("Sample" + i); + } + + return ArtificialSAMUtils.createEnumeratedReadGroups(header, readGroups, sampleNames); + } + + private void createReadStreams() { + perSampleArtificialReadStreams = new HashMap(numSamples); + perSampleStreamAnalyzers = new HashMap(numSamples); + + for (SAMReadGroupRecord readGroup : 
header.getReadGroups() ) { + String readGroupID = readGroup.getReadGroupId(); + String sampleName = readGroup.getSample(); + + int thisSampleNumContigs = MathUtils.randomIntegerInRange(minContigs, maxContigs); + int thisSampleStacksPerContig = MathUtils.randomIntegerInRange(streamStacksPerContig.minStacksPerContig, streamStacksPerContig.maxStacksPerContig); + + int thisSampleNumUnmappedReads = GenomeAnalysisEngine.getRandomGenerator().nextDouble() < unmappedReadsFraction ? unmappedReadsCount : 0; + + ArtificialSingleSampleReadStream thisSampleStream = new ArtificialSingleSampleReadStream(header, + readGroupID, + thisSampleNumContigs, + thisSampleStacksPerContig, + streamStackDepth.minReadsPerStack, + streamStackDepth.maxReadsPerStack, + streamDensity.minDistanceBetweenStacks, + streamDensity.maxDistanceBetweenStacks, + MIN_READ_LENGTH, + MAX_READ_LENGTH, + thisSampleNumUnmappedReads); + perSampleArtificialReadStreams.put(sampleName, thisSampleStream); + perSampleStreamAnalyzers.put(sampleName, new PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer(thisSampleStream, targetCoverage)); + } + + mergedReadStream = new ArtificialMultiSampleReadStream(perSampleArtificialReadStreams.values()); + } + + public void run() { + StingSAMIterator downsamplingIter = new PerSampleDownsamplingReadsIterator(mergedReadStream.getStingSAMIterator(), downsamplerFactory); + + if ( verifySortedness ) { + downsamplingIter = new VerifyingSamIterator(downsamplingIter); + } + + while ( downsamplingIter.hasNext() ) { + SAMRecord read = downsamplingIter.next(); + String sampleName = read.getReadGroup() != null ? 
read.getReadGroup().getSample() : null; + + ArtificialSingleSampleReadStreamAnalyzer analyzer = perSampleStreamAnalyzers.get(sampleName); + if ( analyzer != null ) { + analyzer.update(read); + } + else { + throw new ReviewedStingException("bug: stream analyzer for sample " + sampleName + " not found"); + } + } + + for ( Map.Entry analyzerEntry : perSampleStreamAnalyzers.entrySet() ) { + ArtificialSingleSampleReadStreamAnalyzer analyzer = analyzerEntry.getValue(); + analyzer.finalizeStats(); + + // Validate the downsampled read stream for each sample individually + analyzer.validate(); + } + + // Allow memory used by this test to be reclaimed: + mergedReadStream = null; + perSampleArtificialReadStreams = null; + perSampleStreamAnalyzers = null; + } + } + + @DataProvider(name = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public Object[][] createPerSampleDownsamplingReadsIteratorTests() { + + GenomeAnalysisEngine.resetRandomGenerator(); + + // Some values don't vary across tests + int targetCoverage = PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.UNIFORM_MEDIUM.minReadsPerStack; + ReadsDownsamplerFactory downsamplerFactory = new SimplePositionalDownsamplerFactory(targetCoverage); + int maxContigs = 3; + boolean verifySortedness = true; + + for ( int numSamples : Arrays.asList(1, 2, 10) ) { + for ( int minContigs = 1; minContigs <= maxContigs; minContigs++ ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamDensity streamDensity : PerSampleDownsamplingReadsIteratorTest.StreamDensity.values() ) { + for ( PerSampleDownsamplingReadsIteratorTest.StreamStackDepth streamStackDepth : PerSampleDownsamplingReadsIteratorTest.StreamStackDepth.values() ) { + for (PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig streamStacksPerContig : PerSampleDownsamplingReadsIteratorTest.StreamStacksPerContig.values() ) { + for ( double unmappedReadsFraction : Arrays.asList(0.0, 1.0, 0.5) ) { + for ( int unmappedReadsCount : Arrays.asList(1, 50) ) { + new 
PerSampleDownsamplingReadsIteratorTest(downsamplerFactory, + targetCoverage, + numSamples, + minContigs, + maxContigs, + streamDensity, + streamStackDepth, + streamStacksPerContig, + unmappedReadsFraction, + unmappedReadsCount, + verifySortedness); + } + } + } + } + } + } + } + + return PerSampleDownsamplingReadsIteratorTest.getTests(PerSampleDownsamplingReadsIteratorTest.class); + } + + @Test(dataProvider = "PerSampleDownsamplingReadsIteratorTestDataProvider") + public void runPerSampleDownsamplingReadsIteratorTest( PerSampleDownsamplingReadsIteratorTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java deleted file mode 100644 index b1d8e45c9..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionalDownsamplerUnitTest.java +++ /dev/null @@ -1,357 +0,0 @@ -package org.broadinstitute.sting.gatk.downsampling; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.annotations.Test; -import org.testng.Assert; - -import java.util.*; - -// TODO: generalize these tests so that all possible arrangements of 1-4 stacks can be tested -public class PositionalDownsamplerUnitTest extends BaseTest { - - /** - * ------- - * ------- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeOverlappingIdenticalStacks() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 
100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ------- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeNonOverlappingIdenticalStacks() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 201, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, 
header, "foo", 0, 301, 100)); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeNonOverlappingIdenticalStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) == 1000); - Assert.assertTrue(downsampledStackSizes.get(1) == 1000); - Assert.assertTrue(downsampledStackSizes.get(2) == 1000); - } - - /** - * --- - * --- - * ------- - * ------- - * ------- - * ------- - */ - @Test - public void testThreeStacksWithShortStackAtBeginning() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 20, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackAtBeginning: Downsampled Stack sizes: " + 
downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ------- - * --- - * --- - * ------- - * ------- - */ - @Test - public void testThreeStacksWithShortStackInMiddle() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 25, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 75, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackInMiddle: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + 
downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------ - * ------ - * ------- - * ------- - * --- - * --- - */ - @Test - public void testThreeStacksWithShortStackAtEnd() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(1500, header, "foo", 0, 135, 25)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreeStacksWithShortStackAtEnd: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(0) + downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) + downsampledStackSizes.get(2) <= 1000); - } - - /** - * ------- - * ---- - * ------- - * ---- - * ------- - * ------- - */ - @Test - public void testThreePartiallyOverlappingStacks() { - SAMFileHeader header = 
ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 1, 100, 50)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfVaryingReads(2000, header, "foo", 0, 75, 100, 50)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(2000, header, "foo", 0, 150, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testThreePartiallyOverlappingStacks: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(1) <= 1000); - Assert.assertTrue(downsampledStackSizes.get(2) <= 1000); - - // TODO: need to examine per-base coverage here - } - - @Test - public void testNoDownsamplingRequired() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 1, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 25, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - 
downsampler.submit(createStackOfIdenticalReads(300, header, "foo", 0, 50, 100)); - Assert.assertFalse(downsampler.hasDownsampledItems()); - Assert.assertTrue(downsampler.hasPendingItems()); - - downsampler.signalEndOfInput(); - Assert.assertTrue(downsampler.hasDownsampledItems()); - Assert.assertFalse(downsampler.hasPendingItems()); - - List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampler.consumeDownsampledItems()); - - System.out.println("testNoDownsamplingRequired: Downsampled Stack sizes: " + downsampledStackSizes); - - Assert.assertEquals(downsampledStackSizes.size(), 3); - Assert.assertTrue(downsampledStackSizes.get(0) == 300); - Assert.assertTrue(downsampledStackSizes.get(1) == 300); - Assert.assertTrue(downsampledStackSizes.get(2) == 300); - } - - @Test - public void testGATKSAMRecordSupport() { - SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); - PositionalDownsampler downsampler = new PositionalDownsampler(1000); - - List reads = new ArrayList(); - for ( int i = 0; i < 10; i++ ) { - reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); - } - - downsampler.submit(reads); - downsampler.signalEndOfInput(); - List downsampledReads = downsampler.consumeDownsampledItems(); - - Assert.assertTrue(downsampledReads.size() == 10); - } - - private ArrayList createStackOfIdenticalReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int length ) { - ArrayList stack = new ArrayList(stackSize); - for ( int i = 1; i <= stackSize; i++ ) { - stack.add(ArtificialSAMUtils.createArtificialRead(header, name, refIndex, alignmentStart, length)); - } - return stack; - } - - private ArrayList createStackOfVaryingReads( int stackSize, SAMFileHeader header, String name, int refIndex, int alignmentStart, int firstLength, int secondLength ) { - ArrayList stack = createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, 
firstLength); - stack.addAll(createStackOfIdenticalReads(stackSize / 2, header, name, refIndex, alignmentStart, secondLength)); - return stack; - } - - private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { - List stackSizes = new ArrayList(); - Iterator iter = downsampledReads.iterator(); - Assert.assertTrue(iter.hasNext()); - - SAMRecord previousRead = iter.next(); - int currentStackSize = 1; - - while ( iter.hasNext() ) { - SAMRecord currentRead = iter.next(); - - if ( ! currentRead.getReferenceIndex().equals(previousRead.getReferenceIndex()) || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { - stackSizes.add(currentStackSize); - currentStackSize = 1; - } - else if ( currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { - Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); - } - else { - currentStackSize++; - } - - previousRead = currentRead; - } - - stackSizes.add(currentStackSize); - return stackSizes; - } -} - diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java new file mode 100644 index 000000000..9cbd0db8a --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer.java @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, 
subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStream; +import org.broadinstitute.sting.utils.sam.ArtificialSingleSampleReadStreamAnalyzer; + +/** + * Class for analyzing an artificial read stream that has been positionally downsampled, and verifying + * that the downsampling was done correctly without changing the stream in unexpected ways. 
+ * + * @author David Roazen + */ +public class PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer extends ArtificialSingleSampleReadStreamAnalyzer { + private int targetCoverage; + + public PositionallyDownsampledArtificialSingleSampleReadStreamAnalyzer( ArtificialSingleSampleReadStream originalStream, int targetCoverage ) { + super(originalStream); + this.targetCoverage = targetCoverage; + } + + /** + * Overridden validate() method that checks for the effects of positional downsampling in addition to checking + * for whether the original properties of the stream not affected by downsampling have been preserved + */ + @Override + public void validate() { + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && originalStream.getNumUnmappedReads() == 0 ) { + if ( totalReads != 0 ) { + throw new ReviewedStingException("got reads from the stream, but the stream was configured to have 0 reads"); + } + return; // no further validation needed for the 0-reads case + } + else if ( totalReads == 0 ) { + throw new ReviewedStingException("got no reads from the stream, but the stream was configured to have > 0 reads"); + } + + if ( ! 
allSamplesMatch ) { + throw new ReviewedStingException("some reads had the wrong sample"); + } + + if ( numContigs != originalStream.getNumContigs() ) { + throw new ReviewedStingException("number of contigs not correct"); + } + + if ( stacksPerContig.size() != originalStream.getNumContigs() ) { + throw new ReviewedStingException(String.format("bug in analyzer code: calculated sizes for %d contigs even though there were only %d contigs", + stacksPerContig.size(), originalStream.getNumContigs())); + } + + for ( int contigStackCount : stacksPerContig ) { + if ( contigStackCount != originalStream.getNumStacksPerContig() ) { + throw new ReviewedStingException("contig had incorrect number of stacks"); + } + } + + if ( originalStream.getNumStacksPerContig() > 0 ) { + + // Check for the effects of positional downsampling: + int stackMinimumAfterDownsampling = Math.min(targetCoverage, originalStream.getMinReadsPerStack()); + int stackMaximumAfterDownsampling = targetCoverage; + + if ( minReadsPerStack < stackMinimumAfterDownsampling ) { + throw new ReviewedStingException("stack had fewer than the minimum number of reads after downsampling"); + } + if ( maxReadsPerStack > stackMaximumAfterDownsampling ) { + throw new ReviewedStingException("stack had more than the maximum number of reads after downsampling"); + } + } + else if ( minReadsPerStack != null || maxReadsPerStack != null ) { + throw new ReviewedStingException("bug in analyzer code: reads per stack was calculated even though 0 stacks per contig was specified"); + } + + if ( originalStream.getNumStacksPerContig() > 1 ) { + if ( minDistanceBetweenStacks < originalStream.getMinDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by less than the minimum distance"); + } + if ( maxDistanceBetweenStacks > originalStream.getMaxDistanceBetweenStacks() ) { + throw new ReviewedStingException("stacks were separated by more than the maximum distance"); + } + } + else if ( 
minDistanceBetweenStacks != null || maxDistanceBetweenStacks != null ) { + throw new ReviewedStingException("bug in analyzer code: distance between stacks was calculated even though numStacksPerContig was <= 1"); + } + + if ( minReadLength < originalStream.getMinReadLength() ) { + throw new ReviewedStingException("read was shorter than the minimum allowed length"); + } + if ( maxReadLength > originalStream.getMaxReadLength() ) { + throw new ReviewedStingException("read was longer than the maximum allowed length"); + } + + if ( numUnmappedReads != originalStream.getNumUnmappedReads() ) { + throw new ReviewedStingException(String.format("wrong number of unmapped reads: requested %d but saw %d", + originalStream.getNumUnmappedReads(), numUnmappedReads)); + } + + if ( (originalStream.getNumContigs() == 0 || originalStream.getNumStacksPerContig() == 0) && + numUnmappedReads != totalReads ) { + throw new ReviewedStingException("stream should have consisted only of unmapped reads, but saw some mapped reads"); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java new file mode 100644 index 000000000..75d0448c4 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/ReservoirDownsamplerUnitTest.java @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +public class ReservoirDownsamplerUnitTest extends BaseTest { + + private static class ReservoirDownsamplerTest extends TestDataProvider { + int reservoirSize; + int totalReads; + int expectedNumReadsAfterDownsampling; + int expectedNumDiscardedItems; + + public ReservoirDownsamplerTest( int reservoirSize, int totalReads ) { + super(ReservoirDownsamplerTest.class); + + this.reservoirSize = reservoirSize; + this.totalReads = totalReads; + + expectedNumReadsAfterDownsampling = Math.min(reservoirSize, totalReads); + expectedNumDiscardedItems = totalReads <= reservoirSize ? 
0 : totalReads - reservoirSize; + + setName(String.format("%s: reservoirSize=%d totalReads=%d expectedNumReadsAfterDownsampling=%d expectedNumDiscardedItems=%d", + getClass().getSimpleName(), reservoirSize, totalReads, expectedNumReadsAfterDownsampling, expectedNumDiscardedItems)); + } + + public Collection createReads() { + Collection reads = new ArrayList(totalReads); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(totalReads, header, "foo", 0, 1, 100)); + + return reads; + } + } + + @DataProvider(name = "ReservoirDownsamplerTestDataProvider") + public Object[][] createReservoirDownsamplerTestData() { + for ( int reservoirSize = 1; reservoirSize <= 10000; reservoirSize *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, 0); + for ( int totalReads = 1; totalReads <= 10000; totalReads *= 10 ) { + new ReservoirDownsamplerTest(reservoirSize, totalReads); + } + } + + return ReservoirDownsamplerTest.getTests(ReservoirDownsamplerTest.class); + } + + @Test(dataProvider = "ReservoirDownsamplerTestDataProvider") + public void testReservoirDownsampler( ReservoirDownsamplerTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new ReservoirDownsampler(test.reservoirSize); + + downsampler.submit(test.createReads()); + + if ( test.totalReads > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.totalReads > 0 ) { + 
Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + Assert.assertEquals(downsampledReads.size(), test.expectedNumReadsAfterDownsampling); + + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), test.expectedNumDiscardedItems); + Assert.assertEquals(test.totalReads - downsampledReads.size(), test.expectedNumDiscardedItems); + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java new file mode 100644 index 000000000..5dc41b4a0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/downsampling/SimplePositionalDownsamplerUnitTest.java @@ -0,0 +1,330 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above 
copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.util.*; + +public class SimplePositionalDownsamplerUnitTest extends BaseTest { + + private static class SimplePositionalDownsamplerTest extends TestDataProvider { + int targetCoverage; + int numStacks; + List stackSizes; + List expectedStackSizes; + boolean multipleContigs; + int totalInitialReads; + + public SimplePositionalDownsamplerTest( int targetCoverage, List stackSizes, boolean multipleContigs ) { + super(SimplePositionalDownsamplerTest.class); + + this.targetCoverage = targetCoverage; + this.numStacks = stackSizes.size(); + this.stackSizes = stackSizes; + this.multipleContigs = multipleContigs; + + calculateExpectedDownsampledStackSizes(); + + totalInitialReads = 0; + for ( Integer stackSize : stackSizes ) { + totalInitialReads += stackSize; + } + + setName(String.format("%s: targetCoverage=%d numStacks=%d stackSizes=%s expectedSizes=%s multipleContigs=%b", + 
getClass().getSimpleName(), targetCoverage, numStacks, stackSizes, expectedStackSizes, multipleContigs)); + } + + public Collection createReads() { + Collection reads = new ArrayList(); + SAMFileHeader header = multipleContigs ? + ArtificialSAMUtils.createArtificialSamHeader(2, 1, 1000000) : + ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + int refIndex = 0; + int alignmentStart = 1; + int readLength = 100; + + for ( int i = 0; i < numStacks; i++ ) { + if ( multipleContigs && refIndex == 0 && i >= numStacks / 2 ) { + refIndex++; + } + + reads.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(stackSizes.get(i), header, "foo", + refIndex, alignmentStart, readLength)); + + alignmentStart += 10; + } + + return reads; + } + + private void calculateExpectedDownsampledStackSizes() { + expectedStackSizes = new ArrayList(numStacks); + + for ( Integer stackSize : stackSizes ) { + int expectedSize = targetCoverage >= stackSize ? stackSize : targetCoverage; + expectedStackSizes.add(expectedSize); + } + } + } + + @DataProvider(name = "SimplePositionalDownsamplerTestDataProvider") + public Object[][] createSimplePositionalDownsamplerTestData() { + GenomeAnalysisEngine.resetRandomGenerator(); + + for ( int targetCoverage = 1; targetCoverage <= 10000; targetCoverage *= 10 ) { + for ( int contigs = 1; contigs <= 2; contigs++ ) { + for ( int numStacks = 0; numStacks <= 10; numStacks++ ) { + List stackSizes = new ArrayList(numStacks); + for ( int stack = 1; stack <= numStacks; stack++ ) { + stackSizes.add(GenomeAnalysisEngine.getRandomGenerator().nextInt(targetCoverage * 2) + 1); + } + new SimplePositionalDownsamplerTest(targetCoverage, stackSizes, contigs > 1); + } + } + } + + return SimplePositionalDownsamplerTest.getTests(SimplePositionalDownsamplerTest.class); + } + + @Test( dataProvider = "SimplePositionalDownsamplerTestDataProvider" ) + public void testSimplePostionalDownsampler( SimplePositionalDownsamplerTest test ) { + logger.warn("Running 
test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + + ReadsDownsampler downsampler = new SimplePositionalDownsampler(test.targetCoverage); + + downsampler.submit(test.createReads()); + + if ( test.numStacks > 1 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else if ( test.numStacks == 1 ) { + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + downsampler.signalEndOfInput(); + + if ( test.numStacks > 0 ) { + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + } + else { + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + } + + List downsampledReads = downsampler.consumeFinalizedItems(); + Assert.assertFalse(downsampler.hasFinalizedItems() || downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekFinalized() == null && downsampler.peekPending() == null); + + if ( test.numStacks == 0 ) { + Assert.assertTrue(downsampledReads.isEmpty()); + } + else { + List downsampledStackSizes = getDownsampledStackSizesAndVerifySortedness(downsampledReads); + + Assert.assertEquals(downsampledStackSizes.size(), test.numStacks); + Assert.assertEquals(downsampledStackSizes, test.expectedStackSizes); + + int 
numReadsActuallyEliminated = test.totalInitialReads - downsampledReads.size(); + int numReadsReportedEliminated = downsampler.getNumberOfDiscardedItems(); + Assert.assertEquals(numReadsActuallyEliminated, numReadsReportedEliminated); + } + + downsampler.reset(); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 0); + } + + private List getDownsampledStackSizesAndVerifySortedness( List downsampledReads ) { + List stackSizes = new ArrayList(); + + if ( downsampledReads.isEmpty() ) { + return stackSizes; + } + + Iterator iter = downsampledReads.iterator(); + Assert.assertTrue(iter.hasNext()); + + SAMRecord previousRead = iter.next(); + int currentStackSize = 1; + + while ( iter.hasNext() ) { + SAMRecord currentRead = iter.next(); + + if ( currentRead.getReferenceIndex() > previousRead.getReferenceIndex() || currentRead.getAlignmentStart() > previousRead.getAlignmentStart() ) { + stackSizes.add(currentStackSize); + currentStackSize = 1; + } + else if ( currentRead.getReferenceIndex() < previousRead.getReferenceIndex() || currentRead.getAlignmentStart() < previousRead.getAlignmentStart() ) { + Assert.fail(String.format("Reads are out of order: %s %s", previousRead, currentRead)); + } + else { + currentStackSize++; + } + + previousRead = currentRead; + } + + stackSizes.add(currentStackSize); + return stackSizes; + } + + @Test + public void testSimplePositionalDownsamplerSignalNoMoreReadsBefore() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(50, header, "foo", 0, 1, 100)); + downsampler.submit(readStack); + + Assert.assertFalse(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() == null); + Assert.assertTrue(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() != null); + + 
SAMRecord laterRead = ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 2, 100); + downsampler.signalNoMoreReadsBefore(laterRead); + + Assert.assertTrue(downsampler.hasFinalizedItems()); + Assert.assertTrue(downsampler.peekFinalized() != null); + Assert.assertFalse(downsampler.hasPendingItems()); + Assert.assertTrue(downsampler.peekPending() == null); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), readStack.size()); + } + + @Test + public void testBasicUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection readStack = new ArrayList(); + readStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : readStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(readStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), readStack.size()); + + for ( SAMRecord read: downsampledReads ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + } + + @Test + public void testMixedMappedAndUnmappedReadsSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(100); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + Collection mappedReadStack = new ArrayList(); + mappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", 0, 1, 100)); + for ( SAMRecord read : mappedReadStack ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + + Collection unmappedReadStack = new ArrayList(); + 
unmappedReadStack.addAll(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(200, header, "foo", SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX, + SAMRecord.NO_ALIGNMENT_START, 100)); + for ( SAMRecord read : unmappedReadStack ) { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + downsampler.submit(mappedReadStack); + downsampler.submit(unmappedReadStack); + downsampler.signalEndOfInput(); + + List downsampledReads = downsampler.consumeFinalizedItems(); + + // Unmapped reads should not get downsampled at all by the SimplePositionalDownsampler + Assert.assertEquals(downsampledReads.size(), 300); + Assert.assertEquals(downsampler.getNumberOfDiscardedItems(), 100); + + int count = 1; + for ( SAMRecord read: downsampledReads ) { + if ( count <= 100 ) { + Assert.assertFalse(read.getReadUnmappedFlag()); + } + else { + Assert.assertTrue(read.getReadUnmappedFlag()); + } + + count++; + } + } + + @Test + public void testGATKSAMRecordSupport() { + ReadsDownsampler downsampler = new SimplePositionalDownsampler(1000); + + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000000); + + List reads = new ArrayList(); + for ( int i = 0; i < 10; i++ ) { + reads.add(ArtificialSAMUtils.createArtificialRead(header, "foo", 0, 10, 20 * i + 10)); + } + + downsampler.submit(reads); + downsampler.signalEndOfInput(); + List downsampledReads = downsampler.consumeFinalizedItems(); + + Assert.assertEquals(downsampledReads.size(), 10); + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java new file mode 100644 index 000000000..a49a602c6 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateExperimentalUnitTest.java @@ -0,0 +1,648 @@ +package org.broadinstitute.sting.gatk.iterators; + +import net.sf.samtools.*; +import 
net.sf.samtools.util.CloseableIterator; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; +import org.broadinstitute.sting.gatk.filters.ReadFilter; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * testing of the experimental version of LocusIteratorByState + */ +public class LocusIteratorByStateExperimentalUnitTest extends BaseTest { + private static SAMFileHeader header; + private LocusIteratorByStateExperimental li; + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void beforeClass() { + header = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1000); + genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); + } + + private LocusIteratorByStateExperimental makeLTBS(List reads, ReadProperties readAttributes) { + return new LocusIteratorByStateExperimental(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByStateExperimental.sampleListForSAMWithoutReadGroups()); + } + + @Test + public void testXandEQOperators() { + final byte[] bases1 = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] bases2 = new byte[] 
{'A','A','A','C','A','A','A','A','A','C'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord r1 = ArtificialSAMUtils.createArtificialRead(header,"r1",0,1,10); + r1.setReadBases(bases1); + r1.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r1.setCigarString("10M"); + + SAMRecord r2 = ArtificialSAMUtils.createArtificialRead(header,"r2",0,1,10); + r2.setReadBases(bases2); + r2.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r2.setCigarString("3=1X5=1X"); + + SAMRecord r3 = ArtificialSAMUtils.createArtificialRead(header,"r3",0,1,10); + r3.setReadBases(bases2); + r3.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + r3.setCigarString("3=1X5M1X"); + + SAMRecord r4 = ArtificialSAMUtils.createArtificialRead(header,"r4",0,1,10); + r4.setReadBases(bases2); + r4.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + r4.setCigarString("10M"); + + List reads = Arrays.asList(r1, r2, r3, r4); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup(); + Assert.assertEquals(pileup.depthOfCoverage(), 4); + } + } + + @Test + public void testIndelsInRegularPileup() { + final byte[] bases = new byte[] {'A','A','A','A','A','A','A','A','A','A'}; + final byte[] indelBases = new byte[] {'A','A','A','A','C','T','A','A','A','A','A','A'}; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord before = ArtificialSAMUtils.createArtificialRead(header,"before",0,1,10); + before.setReadBases(bases); + before.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + before.setCigarString("10M"); + + SAMRecord during = ArtificialSAMUtils.createArtificialRead(header,"during",0,2,10); + during.setReadBases(indelBases); 
+ during.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20,20,20}); + during.setCigarString("4M2I6M"); + + SAMRecord after = ArtificialSAMUtils.createArtificialRead(header,"after",0,3,10); + after.setReadBases(bases); + after.setBaseQualities(new byte[] {20,20,20,20,20,20,20,20,20,20}); + after.setCigarString("10M"); + + List reads = Arrays.asList(before, during, after); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads,readAttributes); + + boolean foundIndel = false; + while (li.hasNext()) { + AlignmentContext context = li.next(); + ReadBackedPileup pileup = context.getBasePileup().getBaseFilteredPileup(10); + for (PileupElement p : pileup) { + if (p.isBeforeInsertion()) { + foundIndel = true; + Assert.assertEquals(p.getEventLength(), 2, "Wrong event length"); + Assert.assertEquals(p.getEventBases(), "CT", "Inserted bases are incorrect"); + break; + } + } + + } + + Assert.assertTrue(foundIndel,"Indel in pileup not found"); + } + + @Test + public void testWholeIndelReadInIsolation() { + final int firstLocus = 44367789; + + // create a test version of the Reads object + ReadProperties readAttributes = createTestReadProperties(); + + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header, "indelOnly", 0, firstLocus, 76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte)'A',76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte) '@', 76)); + indelOnlyRead.setCigarString("76I"); + + List reads = Arrays.asList(indelOnlyRead); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, readAttributes); + + // Traditionally, reads that end with indels bleed into the pileup at the following locus. Verify that the next pileup contains this read + // and considers it to be an indel-containing read. 
+ Assert.assertTrue(li.hasNext(),"Should have found a whole-indel read in the normal base pileup without extended events enabled"); + AlignmentContext alignmentContext = li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(), firstLocus, "Base pileup is at incorrect location."); + ReadBackedPileup basePileup = alignmentContext.getBasePileup(); + Assert.assertEquals(basePileup.getReads().size(),1,"Pileup is of incorrect size"); + Assert.assertSame(basePileup.getReads().get(0), indelOnlyRead, "Read in pileup is incorrect"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) do + * not negatively influence the ordering of the pileup. + */ + @Test + public void testWholeIndelRead() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord leadingRead = ArtificialSAMUtils.createArtificialRead(header,"leading",0,firstLocus,76); + leadingRead.setReadBases(Utils.dupBytes((byte)'A',76)); + leadingRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + leadingRead.setCigarString("1M75I"); + + SAMRecord indelOnlyRead = ArtificialSAMUtils.createArtificialRead(header,"indelOnly",0,secondLocus,76); + indelOnlyRead.setReadBases(Utils.dupBytes((byte) 'A', 76)); + indelOnlyRead.setBaseQualities(Utils.dupBytes((byte)'@',76)); + indelOnlyRead.setCigarString("76I"); + + SAMRecord fullMatchAfterIndel = ArtificialSAMUtils.createArtificialRead(header,"fullMatch",0,secondLocus,76); + fullMatchAfterIndel.setReadBases(Utils.dupBytes((byte)'A',76)); + fullMatchAfterIndel.setBaseQualities(Utils.dupBytes((byte)'@',76)); + fullMatchAfterIndel.setCigarString("75I1M"); + + List reads = Arrays.asList(leadingRead, indelOnlyRead, fullMatchAfterIndel); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + int currentLocus = firstLocus; + int numAlignmentContextsFound = 0; + + while(li.hasNext()) { + AlignmentContext alignmentContext = 
li.next(); + Assert.assertEquals(alignmentContext.getLocation().getStart(),currentLocus,"Current locus returned by alignment context is incorrect"); + + if(currentLocus == firstLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),1,"Wrong number of reads at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(0),leadingRead,"leadingRead absent from pileup at locus " + currentLocus); + } + else if(currentLocus == secondLocus) { + List readsAtLocus = alignmentContext.getBasePileup().getReads(); + Assert.assertEquals(readsAtLocus.size(),2,"Wrong number of reads at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(0),indelOnlyRead,"indelOnlyRead absent from pileup at locus " + currentLocus); + Assert.assertSame(readsAtLocus.get(1),fullMatchAfterIndel,"fullMatchAfterIndel absent from pileup at locus " + currentLocus); + } + + currentLocus++; + numAlignmentContextsFound++; + } + + Assert.assertEquals(numAlignmentContextsFound, 2, "Found incorrect number of alignment contexts"); + } + + /** + * Test to make sure that reads supporting only an indel (example cigar string: 76I) are represented properly + */ + @Test + public void testWholeIndelReadRepresentedTest() { + final int firstLocus = 44367788, secondLocus = firstLocus + 1; + + SAMRecord read1 = ArtificialSAMUtils.createArtificialRead(header,"read1",0,secondLocus,1); + read1.setReadBases(Utils.dupBytes((byte) 'A', 1)); + read1.setBaseQualities(Utils.dupBytes((byte) '@', 1)); + read1.setCigarString("1I"); + + List reads = Arrays.asList(read1); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + 
Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "A"); + } + + SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(header,"read2",0,secondLocus,10); + read2.setReadBases(Utils.dupBytes((byte) 'A', 10)); + read2.setBaseQualities(Utils.dupBytes((byte) '@', 10)); + read2.setCigarString("10I"); + + reads = Arrays.asList(read2); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(reads, createTestReadProperties()); + + while(li.hasNext()) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + Assert.assertTrue(pe.isBeforeInsertion()); + Assert.assertFalse(pe.isAfterInsertion()); + Assert.assertEquals(pe.getEventBases(), "AAAAAAAAAA"); + } + } + + //////////////////////////////////////////// + // comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////// + + private static class LIBSTest { + + + final String cigar; + final int readLength; + + private LIBSTest(final String cigar, final int readLength) { + this.cigar = cigar; + this.readLength = readLength; + } + } + + @DataProvider(name = "LIBSTest") + public Object[][] createLIBSTestData() { + + //TODO -- when LIBS is fixed this should be replaced to provide all possible permutations of CIGAR strings + + return new Object[][]{ + {new LIBSTest("1I", 1)}, + {new LIBSTest("10I", 10)}, + {new LIBSTest("2M2I2M", 6)}, + {new LIBSTest("2M2I", 4)}, + //TODO -- uncomment these when LIBS is fixed + //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, + //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, + //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), 
Arrays.asList(IS_AFTER_INSERTION_FLAG))}, + //{new LIBSTest("1M2D2M", 3)}, + {new LIBSTest("1S1M", 2)}, + {new LIBSTest("1M1S", 2)}, + {new LIBSTest("1S1M1I", 3)} + }; + } + + @Test(dataProvider = "LIBSTest") + public void testLIBS(LIBSTest params) { + final int locus = 44367788; + + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength)); + read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength)); + read.setCigarString(params.cigar); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(Arrays.asList(read), createTestReadProperties()); + final LIBS_position tester = new LIBS_position(read); + + while ( li.hasNext() ) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + + tester.stepForwardOnGenome(); + + Assert.assertEquals(pe.isBeforeDeletedBase(), tester.isBeforeDeletedBase); + Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart); + Assert.assertEquals(pe.isAfterDeletedBase(), tester.isAfterDeletedBase); + Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd); + Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion); + Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion); + Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip); + Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset()); + } + } + + //////////////////////////////////////////////// + // End comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////////// + + + /////////////////////////////////////// + // Read State Manager Tests // + /////////////////////////////////////// + + private class PerSampleReadStateManagerTest extends TestDataProvider { + private List 
readCountsPerAlignmentStart; + private List reads; + private List> recordStatesByAlignmentStart; + private int removalInterval; + + public PerSampleReadStateManagerTest( List readCountsPerAlignmentStart, int removalInterval ) { + super(PerSampleReadStateManagerTest.class); + + this.readCountsPerAlignmentStart = readCountsPerAlignmentStart; + this.removalInterval = removalInterval; + + reads = new ArrayList(); + recordStatesByAlignmentStart = new ArrayList>(); + + setName(String.format("%s: readCountsPerAlignmentStart: %s removalInterval: %d", + getClass().getSimpleName(), readCountsPerAlignmentStart, removalInterval)); + } + + public void run() { + LocusIteratorByStateExperimental libs = makeLTBS(new ArrayList(), createTestReadProperties()); + LocusIteratorByStateExperimental.ReadStateManager readStateManager = + libs.new ReadStateManager(new ArrayList().iterator()); + LocusIteratorByStateExperimental.ReadStateManager.PerSampleReadStateManager perSampleReadStateManager = + readStateManager.new PerSampleReadStateManager(); + + makeReads(); + + for ( ArrayList stackRecordStates : recordStatesByAlignmentStart ) { + perSampleReadStateManager.addStatesAtNextAlignmentStart(stackRecordStates); + } + + // read state manager should have the right number of reads + Assert.assertEquals(reads.size(), perSampleReadStateManager.size()); + + Iterator originalReadsIterator = reads.iterator(); + Iterator recordStateIterator = perSampleReadStateManager.iterator(); + int recordStateCount = 0; + int numReadStatesRemoved = 0; + + // Do a first-pass validation of the record state iteration by making sure we get back everything we + // put in, in the same order, doing any requested removals of read states along the way + while ( recordStateIterator.hasNext() ) { + LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); + recordStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + 
Assert.assertTrue(originalReadsIterator.hasNext()); + SAMRecord originalRead = originalReadsIterator.next(); + + // The read we get back should be literally the same read in memory as we put in + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + + // If requested, remove a read state every removalInterval states + if ( removalInterval > 0 && recordStateCount % removalInterval == 0 ) { + recordStateIterator.remove(); + numReadStatesRemoved++; + } + } + + Assert.assertFalse(originalReadsIterator.hasNext()); + + // If we removed any read states, do a second pass through the read states to make sure the right + // states were removed + if ( numReadStatesRemoved > 0 ) { + Assert.assertEquals(perSampleReadStateManager.size(), reads.size() - numReadStatesRemoved); + + originalReadsIterator = reads.iterator(); + recordStateIterator = perSampleReadStateManager.iterator(); + int readCount = 0; + int readStateCount = 0; + + // Match record states with the reads that should remain after removal + while ( recordStateIterator.hasNext() ) { + LocusIteratorByStateExperimental.SAMRecordState readState = recordStateIterator.next(); + readStateCount++; + SAMRecord readFromPerSampleReadStateManager = readState.getRead(); + + Assert.assertTrue(originalReadsIterator.hasNext()); + + SAMRecord originalRead = originalReadsIterator.next(); + readCount++; + + if ( readCount % removalInterval == 0 ) { + originalRead = originalReadsIterator.next(); // advance to next read, since the previous one should have been discarded + readCount++; + } + + // The read we get back should be literally the same read in memory as we put in (after accounting for removals) + Assert.assertTrue(originalRead == readFromPerSampleReadStateManager); + } + + Assert.assertEquals(readStateCount, reads.size() - numReadStatesRemoved); + } + + // Allow memory used by this test to be reclaimed + readCountsPerAlignmentStart = null; + reads = null; + recordStatesByAlignmentStart = null; + } + + private 
void makeReads() { + int alignmentStart = 1; + + for ( int readsThisStack : readCountsPerAlignmentStart ) { + ArrayList stackReads = new ArrayList(ArtificialSAMUtils.createStackOfIdenticalArtificialReads(readsThisStack, header, "foo", 0, alignmentStart, MathUtils.randomIntegerInRange(50, 100))); + ArrayList stackRecordStates = new ArrayList(); + + for ( SAMRecord read : stackReads ) { + stackRecordStates.add(new LocusIteratorByStateExperimental.SAMRecordState(read)); + } + + reads.addAll(stackReads); + recordStatesByAlignmentStart.add(stackRecordStates); + } + } + } + + @DataProvider(name = "PerSampleReadStateManagerTestDataProvider") + public Object[][] createPerSampleReadStateManagerTests() { + for ( List thisTestReadStateCounts : Arrays.asList( Arrays.asList(1), + Arrays.asList(2), + Arrays.asList(10), + Arrays.asList(1, 1), + Arrays.asList(2, 2), + Arrays.asList(10, 10), + Arrays.asList(1, 10), + Arrays.asList(10, 1), + Arrays.asList(1, 1, 1), + Arrays.asList(2, 2, 2), + Arrays.asList(10, 10, 10), + Arrays.asList(1, 1, 1, 1, 1, 1), + Arrays.asList(10, 10, 10, 10, 10, 10), + Arrays.asList(1, 2, 10, 1, 2, 10) + ) ) { + + for ( int removalInterval : Arrays.asList(0, 2, 3) ) { + new PerSampleReadStateManagerTest(thisTestReadStateCounts, removalInterval); + } + } + + return PerSampleReadStateManagerTest.getTests(PerSampleReadStateManagerTest.class); + } + + @Test(dataProvider = "PerSampleReadStateManagerTestDataProvider") + public void runPerSampleReadStateManagerTest( PerSampleReadStateManagerTest test ) { + logger.warn("Running test: " + test); + + test.run(); + } + + /////////////////////////////////////// + // End Read State Manager Tests // + /////////////////////////////////////// + + + + /////////////////////////////////////// + // Helper methods / classes // + /////////////////////////////////////// + + private static ReadProperties createTestReadProperties() { + return createTestReadProperties(null); + } + + private static ReadProperties 
createTestReadProperties( DownsamplingMethod downsamplingMethod ) { + return new ReadProperties( + Collections.emptyList(), + new SAMFileHeader(), + SAMFileHeader.SortOrder.coordinate, + false, + SAMFileReader.ValidationStringency.STRICT, + downsamplingMethod, + new ValidationExclusion(), + Collections.emptyList(), + Collections.emptyList(), + false, + (byte) -1 + ); + } + + private static class FakeCloseableIterator implements CloseableIterator { + Iterator iterator; + + public FakeCloseableIterator(Iterator it) { + iterator = it; + } + + @Override + public void close() {} + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException("Don't remove!"); + } + } + + private static final class LIBS_position { + + SAMRecord read; + + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; + + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + + boolean sawMop = false; + + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } + + public int getCurrentReadOffset() { + return Math.max(0, currentReadOffset - 1); + } + + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. 
+ */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) + return false; + + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; + + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; + break; + + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } + + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || 
(!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; + } + + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } + + private static boolean isAfterOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java index edd97f17f..83913fa76 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/LocusIteratorByStateUnitTest.java @@ -1,27 +1,28 @@ package org.broadinstitute.sting.gatk.iterators; -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMFileReader; -import net.sf.samtools.SAMRecord; +import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; -import org.broadinstitute.sting.gatk.filters.ReadFilter; -import org.broadinstitute.sting.utils.Utils; -import org.broadinstitute.sting.utils.pileup.PileupElement; -import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.testng.Assert; import 
org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; -import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; +import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.baq.BAQ; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.*; +import java.util.Arrays; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; /** * testing of the LocusIteratorByState @@ -37,7 +38,7 @@ public class LocusIteratorByStateUnitTest extends BaseTest { genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); } - private final LocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { + private LocusIteratorByState makeLTBS(List reads, ReadProperties readAttributes) { return new LocusIteratorByState(new FakeCloseableIterator(reads.iterator()), readAttributes, genomeLocParser, LocusIteratorByState.sampleListForSAMWithoutReadGroups()); } @@ -255,20 +256,91 @@ public class LocusIteratorByStateUnitTest extends BaseTest { } } + //////////////////////////////////////////// + // comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////// + + private static class LIBSTest { + + + final String cigar; + final int readLength; + + private LIBSTest(final String cigar, final int 
readLength) { + this.cigar = cigar; + this.readLength = readLength; + } + } + + @DataProvider(name = "LIBSTest") + public Object[][] createLIBSTestData() { + + //TODO -- when LIBS is fixed this should be replaced to provide all possible permutations of CIGAR strings + + return new Object[][]{ + {new LIBSTest("1I", 1)}, + {new LIBSTest("10I", 10)}, + {new LIBSTest("2M2I2M", 6)}, + {new LIBSTest("2M2I", 4)}, + //TODO -- uncomment these when LIBS is fixed + //{new LIBSTest("2I2M", 4, Arrays.asList(2,3), Arrays.asList(IS_AFTER_INSERTION_FLAG,0))}, + //{new LIBSTest("1I1M1D1M", 3, Arrays.asList(0,1), Arrays.asList(IS_AFTER_INSERTION_FLAG | IS_BEFORE_DELETION_START_FLAG | IS_BEFORE_DELETED_BASE_FLAG,IS_AFTER_DELETED_BASE_FLAG | IS_AFTER_DELETION_END_FLAG))}, + //{new LIBSTest("1S1I1M", 3, Arrays.asList(2), Arrays.asList(IS_AFTER_INSERTION_FLAG))}, + //{new LIBSTest("1M2D2M", 3)}, + {new LIBSTest("1S1M", 2)}, + {new LIBSTest("1M1S", 2)}, + {new LIBSTest("1S1M1I", 3)} + }; + } + + @Test(dataProvider = "LIBSTest") + public void testLIBS(LIBSTest params) { + final int locus = 44367788; + + SAMRecord read = ArtificialSAMUtils.createArtificialRead(header, "read", 0, locus, params.readLength); + read.setReadBases(Utils.dupBytes((byte) 'A', params.readLength)); + read.setBaseQualities(Utils.dupBytes((byte) '@', params.readLength)); + read.setCigarString(params.cigar); + + // create the iterator by state with the fake reads and fake records + li = makeLTBS(Arrays.asList(read), createTestReadProperties()); + final LIBS_position tester = new LIBS_position(read); + + while ( li.hasNext() ) { + AlignmentContext alignmentContext = li.next(); + ReadBackedPileup p = alignmentContext.getBasePileup(); + Assert.assertTrue(p.getNumberOfElements() == 1); + PileupElement pe = p.iterator().next(); + + tester.stepForwardOnGenome(); + + Assert.assertEquals(pe.isBeforeDeletedBase(), tester.isBeforeDeletedBase); + Assert.assertEquals(pe.isBeforeDeletionStart(), tester.isBeforeDeletionStart); + 
Assert.assertEquals(pe.isAfterDeletedBase(), tester.isAfterDeletedBase); + Assert.assertEquals(pe.isAfterDeletionEnd(), tester.isAfterDeletionEnd); + Assert.assertEquals(pe.isBeforeInsertion(), tester.isBeforeInsertion); + Assert.assertEquals(pe.isAfterInsertion(), tester.isAfterInsertion); + Assert.assertEquals(pe.isNextToSoftClip(), tester.isNextToSoftClip); + Assert.assertEquals(pe.getOffset(), tester.getCurrentReadOffset()); + } + } + + //////////////////////////////////////////////// + // End comprehensive LIBS/PileupElement tests // + //////////////////////////////////////////////// + private static ReadProperties createTestReadProperties() { return new ReadProperties( Collections.emptyList(), new SAMFileHeader(), + SAMFileHeader.SortOrder.coordinate, false, SAMFileReader.ValidationStringency.STRICT, null, new ValidationExclusion(), Collections.emptyList(), + Collections.emptyList(), false, - BAQ.CalculationMode.OFF, - BAQ.QualityMode.DONT_MODIFY, - null, // no BAQ - null, // no BQSR (byte) -1 ); } @@ -282,9 +354,7 @@ class FakeCloseableIterator implements CloseableIterator { } @Override - public void close() { - return; - } + public void close() {} @Override public boolean hasNext() { @@ -301,3 +371,110 @@ class FakeCloseableIterator implements CloseableIterator { throw new UnsupportedOperationException("Don't remove!"); } } + + +final class LIBS_position { + + SAMRecord read; + + final int numOperators; + int currentOperatorIndex = 0; + int currentPositionOnOperator = 0; + int currentReadOffset = 0; + + boolean isBeforeDeletionStart = false; + boolean isBeforeDeletedBase = false; + boolean isAfterDeletionEnd = false; + boolean isAfterDeletedBase = false; + boolean isBeforeInsertion = false; + boolean isAfterInsertion = false; + boolean isNextToSoftClip = false; + + boolean sawMop = false; + + public LIBS_position(final SAMRecord read) { + this.read = read; + numOperators = read.getCigar().numCigarElements(); + } + + public int getCurrentReadOffset() { + 
return Math.max(0, currentReadOffset - 1); + } + + /** + * Steps forward on the genome. Returns false when done reading the read, true otherwise. + */ + public boolean stepForwardOnGenome() { + if ( currentOperatorIndex == numOperators ) + return false; + + CigarElement curElement = read.getCigar().getCigarElement(currentOperatorIndex); + if ( currentPositionOnOperator >= curElement.getLength() ) { + if ( ++currentOperatorIndex == numOperators ) + return false; + + curElement = read.getCigar().getCigarElement(currentOperatorIndex); + currentPositionOnOperator = 0; + } + + switch ( curElement.getOperator() ) { + case I: // insertion w.r.t. the reference + if ( !sawMop ) + break; + case S: // soft clip + currentReadOffset += curElement.getLength(); + case H: // hard clip + case P: // padding + currentOperatorIndex++; + return stepForwardOnGenome(); + + case D: // deletion w.r.t. the reference + case N: // reference skip (looks and gets processed just like a "deletion", just different logical meaning) + currentPositionOnOperator++; + break; + + case M: + case EQ: + case X: + sawMop = true; + currentReadOffset++; + currentPositionOnOperator++; + break; + default: + throw new IllegalStateException("No support for cigar op: " + curElement.getOperator()); + } + + final boolean isFirstOp = currentOperatorIndex == 0; + final boolean isLastOp = currentOperatorIndex == numOperators - 1; + final boolean isFirstBaseOfOp = currentPositionOnOperator == 1; + final boolean isLastBaseOfOp = currentPositionOnOperator == curElement.getLength(); + + isBeforeDeletionStart = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isLastOp, isLastBaseOfOp); + isBeforeDeletedBase = isBeforeDeletionStart || (!isLastBaseOfOp && curElement.getOperator() == CigarOperator.D); + isAfterDeletionEnd = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.D, isFirstOp, isFirstBaseOfOp); + isAfterDeletedBase = isAfterDeletionEnd || (!isFirstBaseOfOp && curElement.getOperator() == 
CigarOperator.D); + isBeforeInsertion = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isLastOp, isLastBaseOfOp) + || (!sawMop && curElement.getOperator() == CigarOperator.I); + isAfterInsertion = isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.I, isFirstOp, isFirstBaseOfOp); + isNextToSoftClip = isBeforeOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isLastOp, isLastBaseOfOp) + || isAfterOp(read.getCigar(), currentOperatorIndex, CigarOperator.S, isFirstOp, isFirstBaseOfOp); + + return true; + } + + private static boolean isBeforeOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isLastOp, + final boolean isLastBaseOfOp) { + return !isLastOp && isLastBaseOfOp && cigar.getCigarElement(currentOperatorIndex+1).getOperator() == op; + } + + private static boolean isAfterOp(final Cigar cigar, + final int currentOperatorIndex, + final CigarOperator op, + final boolean isFirstOp, + final boolean isFirstBaseOfOp) { + return !isFirstOp && isFirstBaseOfOp && cigar.getCigarElement(currentOperatorIndex-1).getOperator() == op; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java index 3b5d8d6b7..f0d7f83dc 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/iterators/VerifyingSamIteratorUnitTest.java @@ -28,14 +28,12 @@ import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceRecord; -import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.testng.Assert; import org.testng.annotations.BeforeClass; import 
org.testng.annotations.Test; -import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -48,7 +46,6 @@ import java.util.List; */ public class VerifyingSamIteratorUnitTest { private SAMFileHeader samFileHeader; - private GenomeLocParser genomeLocParser; @BeforeClass public void init() { @@ -58,8 +55,6 @@ public class VerifyingSamIteratorUnitTest { samFileHeader = new SAMFileHeader(); samFileHeader.setSequenceDictionary(sequenceDictionary); - - genomeLocParser = new GenomeLocParser(sequenceDictionary); } @Test @@ -68,7 +63,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),2,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); @@ -83,7 +78,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(1).getSequenceIndex(),1,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); @@ -98,7 +93,7 @@ public class VerifyingSamIteratorUnitTest { SAMRecord read2 = ArtificialSAMUtils.createArtificialRead(samFileHeader,"read2",getContig(0).getSequenceIndex(),1,10); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new 
VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); @@ -116,7 +111,7 @@ public class VerifyingSamIteratorUnitTest { read1.setReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); List reads = Arrays.asList(read1,read2); - VerifyingSamIterator iterator = new VerifyingSamIterator(genomeLocParser,StingSAMIteratorAdapter.adapt(reads.iterator())); + VerifyingSamIterator iterator = new VerifyingSamIterator(StingSAMIteratorAdapter.adapt(reads.iterator())); Assert.assertTrue(iterator.hasNext(),"Insufficient reads"); Assert.assertSame(iterator.next(),read1,"Incorrect read in read 1 position"); diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java deleted file mode 100644 index 2198c461d..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/ReadMetaDataTrackerUnitTest.java +++ /dev/null @@ -1,276 +0,0 @@ -/* - * Copyright (c) 2010. The Broad Institute - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.refdata; - -import net.sf.samtools.SAMFileHeader; -import net.sf.samtools.SAMRecord; -import org.testng.Assert; -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.gatk.datasources.providers.RODMetaDataContainer; -import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; - -import org.testng.annotations.BeforeMethod; - -import org.testng.annotations.BeforeClass; -import org.testng.annotations.Test; - -import java.util.*; - - -/** - * @author aaron - *

    - * Class ReadMetaDataTrackerUnitTest - *

    - * test out the ReadMetaDataTracker - */ -public class ReadMetaDataTrackerUnitTest extends BaseTest { - private static int startingChr = 1; - private static int endingChr = 2; - private static int readCount = 100; - private static int DEFAULT_READ_LENGTH = ArtificialSAMUtils.DEFAULT_READ_LENGTH; - private static SAMFileHeader header; - private Set nameSet; - - private GenomeLocParser genomeLocParser; - - @BeforeClass - public void beforeClass() { - header = ArtificialSAMUtils.createArtificialSamHeader((endingChr - startingChr) + 1, startingChr, readCount + DEFAULT_READ_LENGTH); - genomeLocParser = new GenomeLocParser(header.getSequenceDictionary()); - } - - @BeforeMethod - public void beforeEach() { - nameSet = new TreeSet(); - nameSet.add("default"); - } - - @Test - public void twoRodsAtEachReadBase() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getReadOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getReadOffsetMapping().get(x).size(), 2); - } - Assert.assertEquals(count, 10); - } - - @Test - public void rodAtEachReadBase() { - - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getReadOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getReadOffsetMapping().get(x).size(), 1); - } - Assert.assertEquals(count, 10); - } - - @Test - public void filterByName() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping("default"); - for (Integer x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), 1); - } - Assert.assertEquals(count, 10); - } - - @Test - public void filterByDupType() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, false); // create both RODs of the same 
type - // count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping(FakeRODatum.class); - for (Integer x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), 2); - } - Assert.assertEquals(count, 10); - } - - // @Test this test can be uncommented to determine the speed impacts of any changes to the RODs for reads system - - public void filterByMassiveDupType() { - - for (int y = 0; y < 20; y++) { - nameSet.add("default" + String.valueOf(y)); - long firstTime = System.currentTimeMillis(); - for (int lp = 0; lp < 1000; lp++) { - ReadMetaDataTracker tracker = getRMDT(1, nameSet, false); // create both RODs of the same type - // count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping(FakeRODatum.class); - for (Integer x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), y + 2); - } - Assert.assertEquals(count, 10); - } - System.err.println(y + " = " + (System.currentTimeMillis() - firstTime)); - } - } - - - @Test - public void filterByType() { - nameSet.add("default2"); - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - Map> map = tracker.getReadOffsetMapping(Fake2RODatum.class); - for (int x : map.keySet()) { - count++; - Assert.assertEquals(map.get(x).size(), 1); - } - Assert.assertEquals(count, 10); - } - - @Test - public void sparceRODsForRead() { - ReadMetaDataTracker tracker = getRMDT(7, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getReadOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getReadOffsetMapping().get(x).size(), 1); - } - Assert.assertEquals(count, 2); - } - - @Test - public void rodByGenomeLoc() { - ReadMetaDataTracker tracker = getRMDT(1, nameSet, true); - - // count the positions - int count = 0; - for (Integer x : tracker.getContigOffsetMapping().keySet()) { - count++; - Assert.assertEquals(tracker.getContigOffsetMapping().get(x).size(), 1); - } - 
Assert.assertEquals(count, 10); - } - - - /** - * create a ReadMetaDataTracker given: - * - * @param incr the spacing between site locations - * @param names the names of the reference ordered data to create: one will be created at every location for each name - * - * @return a ReadMetaDataTracker - */ - private ReadMetaDataTracker getRMDT(int incr, Set names, boolean alternateTypes) { - SAMRecord record = ArtificialSAMUtils.createArtificialRead(header, "name", 0, 1, 10); - TreeMap data = new TreeMap(); - for (int x = 0; x < record.getAlignmentEnd(); x += incr) { - GenomeLoc loc = genomeLocParser.createGenomeLoc(record.getReferenceName(), record.getAlignmentStart() + x, record.getAlignmentStart() + x); - RODMetaDataContainer set = new RODMetaDataContainer(); - - int cnt = 0; - for (String name : names) { - if (alternateTypes) - set.addEntry((cnt % 2 == 0) ? new FakeRODatum(loc, name) : new Fake2RODatum(loc, name)); - else - set.addEntry(new FakeRODatum(loc, name)); - cnt++; - } - data.put(record.getAlignmentStart() + x, set); - } - ReadMetaDataTracker tracker = new ReadMetaDataTracker(genomeLocParser, record, data); - return tracker; - } - - - /** for testing, we want a fake rod with a different classname, for the get-by-class-name functions */ - static public class Fake2RODatum extends FakeRODatum { - - public Fake2RODatum(GenomeLoc location, String name) { - super(location, name); - } - } - - - /** for testing only */ - static public class FakeRODatum extends GATKFeature { - - final GenomeLoc location; - final String name; - - public FakeRODatum(GenomeLoc location, String name) { - super(name); - this.location = location; - this.name = name; - } - - @Override - public String getName() { - return name; - } - - @Override - public GenomeLoc getLocation() { - return this.location; - } - - @Override - public Object getUnderlyingObject() { - return null; //To change body of implemented methods use File | Settings | File Templates. 
- } - - @Override - public String getChr() { - return location.getContig(); - } - - @Override - public int getStart() { - return (int)this.location.getStart(); - } - - @Override - public int getEnd() { - return (int)this.location.getStop(); - } - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java index 91c18078e..2f73e373c 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/refdata/RefMetaDataTrackerUnitTest.java @@ -133,7 +133,7 @@ public class RefMetaDataTrackerUnitTest { List x = new ArrayList(); if ( AValues != null ) x.add(AValues); if ( BValues != null ) x.add(BValues); - return new RefMetaDataTracker(x, context); + return new RefMetaDataTracker(x); } public int nBoundTracks() { diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java index 7845515d8..bf1fc9e65 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseReadsUnitTest.java @@ -6,13 +6,12 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.ReadShardBalancer; import org.broadinstitute.sting.gatk.datasources.reads.SAMDataSource; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.Shard; import 
org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.qc.CountReads; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -62,9 +61,9 @@ public class TraverseReadsUnitTest extends BaseTest { private SAMReaderID bam = new SAMReaderID(new File(validationDataLocation + "index_test.bam"),new Tags()); // TCGA-06-0188.aligned.duplicates_marked.bam"); private File refFile = new File(validationDataLocation + "Homo_sapiens_assembly17.fasta"); private List bamList; - private Walker countReadWalker; + private ReadWalker countReadWalker; private File output; - private TraverseReads traversalEngine = null; + private TraverseReadsNano traversalEngine = null; private IndexedFastaSequenceFile ref = null; private GenomeLocParser genomeLocParser = null; @@ -107,7 +106,7 @@ public class TraverseReadsUnitTest extends BaseTest { bamList.add(bam); countReadWalker = new CountReads(); - traversalEngine = new TraverseReads(); + traversalEngine = new TraverseReadsNano(1); traversalEngine.initialize(engine); } @@ -121,13 +120,11 @@ public class TraverseReadsUnitTest extends BaseTest { Object accumulator = countReadWalker.reduceInit(); for(Shard shard: shardStrategy) { - traversalEngine.startTimersIfNecessary(); - if (shard == null) { fail("Shard == null"); } - ShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null); + ReadShardDataProvider dataProvider = new ReadShardDataProvider(shard,genomeLocParser,dataSource.seek(shard),null,null); accumulator = traversalEngine.traverse(countReadWalker, dataProvider, accumulator); dataProvider.close(); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java index 9d9b91872..e16ef3125 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PileupWalkerIntegrationTest.java @@ -5,15 +5,7 @@ import org.testng.annotations.Test; import java.util.Arrays; -/** - * Created by IntelliJ IDEA. - * User: chartl - * Date: Dec 1, 2009 - * Time: 9:03:34 AM - * To change this template use File | Settings | File Templates. - */ public class PileupWalkerIntegrationTest extends WalkerTest { - @Test public void testGnarleyFHSPileup() { String gatk_args = "-T Pileup -I " + validationDataLocation + "FHS_Pileup_Test.bam " @@ -23,4 +15,28 @@ public class PileupWalkerIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(expected_md5)); executeTest("Testing the standard (no-indel) pileup on three merged FHS pools with 27 deletions in 969 bases", spec); } + + + + private final static String SingleReadAligningOffChromosome1MD5 = "4a45fe1f85aaa8c4158782f2b6dee2bd"; + @Test + public void testSingleReadAligningOffChromosome1() { + String gatk_args = "-T Pileup " + + " -I " + privateTestDir + "readOffb37contig1.bam" + + " -R " + b37KGReference + + " -o %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(SingleReadAligningOffChromosome1MD5)); + executeTest("Testing single read spanning off chromosome 1", spec); + } + + @Test + public void testSingleReadAligningOffChromosome1NoIndex() { + String gatk_args = "-T Pileup " + + " -I " + privateTestDir + "readOffb37contig1.noIndex.bam" + + " -R " + b37KGReference + + " -U ALLOW_UNINDEXED_BAM" + + " -o %s"; + WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 1, Arrays.asList(SingleReadAligningOffChromosome1MD5)); + executeTest("Testing single read spanning off chromosome 1 unindexed", spec); + } } diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java index 057cf1cf9..717d9d953 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/PrintReadsIntegrationTest.java @@ -38,7 +38,8 @@ public class PrintReadsIntegrationTest extends WalkerTest { {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1", "6e920b8505e7e95d67634b0905237dbc")}, {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L unmapped", "13bb9a91b1d4dd2425f73302b8a1ac1c")}, {new PRTest(b37KGReference, "unmappedFlagReadsInLastLinearBin.bam", " -L 1 -L unmapped", "6e920b8505e7e95d67634b0905237dbc")}, - {new PRTest(b37KGReference, "oneReadAllInsertion.bam", "", "6caec4f8a25befb6aba562955401af93")} + {new PRTest(b37KGReference, "oneReadAllInsertion.bam", "", "6caec4f8a25befb6aba562955401af93")}, + {new PRTest(b37KGReference, "NA12878.1_10mb_2_10mb.bam", "", "c43380ac39b98853af457b90e52f8427")} }; } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 17d27c156..01dff0089 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - 
Arrays.asList("95b0627bfcac2191aed9908904e892ff")); + Arrays.asList("fbfbd4d13b7ba3d76e8e186902e81378")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("0e2509349fd6c8a9e9408c918215e1de")); + Arrays.asList("19aef8914efc497192f89a9038310ca5")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("32d81a7797605afb526983a2ab45efc2")); + Arrays.asList("4f0b8033da18e6cf6e9b8d5d36c21ba2")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("350539ccecea0d1f7fffd4ac29c015e7")); + Arrays.asList("64ca176d587dfa2b3b9dec9f7999305c")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G 
Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("c222361819fae035a0162f876990fdee")); + Arrays.asList("0c810f6c4abef9d9dc5513ca872d3d22")); executeTest("test overwriting header", spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java index f1ffbe80f..220ffa1e1 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; +import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.report.GATKReport; import org.broadinstitute.sting.gatk.report.GATKReportTable; import org.broadinstitute.sting.utils.recalibration.RecalUtils; @@ -7,49 +8,70 @@ import org.testng.Assert; import org.testng.annotations.Test; import java.io.File; +import java.util.Arrays; import java.util.LinkedList; import java.util.List; /** - * @author Mauricio Carneiro - * @since 3/7/12 + * @author Eric Banks + * @since 9/20/12 */ -public class BQSRGathererUnitTest { - RecalibrationArgumentCollection RAC; +public class BQSRGathererUnitTest extends BaseTest { - private static File recal = new File("public/testdata/exampleGRP.grp"); + private static File recal1 = new File(privateTestDir + "HiSeq.1mb.1RG.sg1.table"); + private static File recal2 = new File(privateTestDir + "HiSeq.1mb.1RG.sg2.table"); + private static File recal3 = new File(privateTestDir + "HiSeq.1mb.1RG.sg3.table"); + private static File recal4 = new File(privateTestDir + "HiSeq.1mb.1RG.sg4.table"); + private static File recal5 = new File(privateTestDir + "HiSeq.1mb.1RG.sg5.table"); - //todo -- this test doesnt work because the primary keys in different tables are not the 
same. Need to either implement "sort" for testing purposes on GATKReport or have a sophisticated comparison measure - @Test(enabled = false) - public void testCombineSimilarFiles() { + private static File recal_original = new File(privateTestDir + "HiSeq.1mb.1RG.noSG.table"); + + @Test(enabled = true) + public void testGatherBQSR() { BQSRGatherer gatherer = new BQSRGatherer(); List recalFiles = new LinkedList (); - File output = new File("foo.grp"); - recalFiles.add(recal); - recalFiles.add(recal); + final File output = BaseTest.createTempFile("BQSRgathererTest", ".table"); + + recalFiles.add(recal1); + recalFiles.add(recal2); + recalFiles.add(recal3); + recalFiles.add(recal4); + recalFiles.add(recal5); gatherer.gather(recalFiles, output); - GATKReport originalReport = new GATKReport(recal); - GATKReport calculatedReport = new GATKReport(output); - for (GATKReportTable originalTable : originalReport.getTables()) { - GATKReportTable calculatedTable = calculatedReport.getTable(originalTable.getTableName()); - List columnsToTest = new LinkedList(); - columnsToTest.add(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME); - columnsToTest.add(RecalUtils.NUMBER_ERRORS_COLUMN_NAME); - if (originalTable.getTableName().equals(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE)) { // these tables must be IDENTICAL - columnsToTest.add(RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); - testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 1); - } - - else if (originalTable.getTableName().equals(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE)) { - columnsToTest.add(RecalUtils.QUANTIZED_COUNT_COLUMN_NAME); - testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2); - } - - else if (originalTable.getTableName().startsWith("RecalTable")) { - testTablesWithColumnsAndFactor(originalTable, calculatedTable, columnsToTest, 2); - } - } + GATKReport originalReport = new GATKReport(recal_original); + GATKReport calculatedReport = new GATKReport(output); + + + // test the 
Arguments table + List columnsToTest = Arrays.asList(RecalUtils.ARGUMENT_COLUMN_NAME, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); + GATKReportTable originalTable = originalReport.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE); + GATKReportTable calculatedTable = calculatedReport.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE); + testTablesWithColumns(originalTable, calculatedTable, columnsToTest); + + // test the Quantized table + columnsToTest = Arrays.asList(RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.QUANTIZED_COUNT_COLUMN_NAME, RecalUtils.QUANTIZED_VALUE_COLUMN_NAME); + originalTable = originalReport.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE); + calculatedTable = calculatedReport.getTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE); + testTablesWithColumns(originalTable, calculatedTable, columnsToTest); + + // test the RecalTable0 table + columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME); + originalTable = originalReport.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE); + calculatedTable = calculatedReport.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE); + testTablesWithColumns(originalTable, calculatedTable, columnsToTest); + + // test the RecalTable1 table + columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME); + originalTable = originalReport.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE); + calculatedTable = calculatedReport.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE); + testTablesWithColumns(originalTable, calculatedTable, columnsToTest); + + // test the RecalTable2 table + columnsToTest = 
Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.COVARIATE_VALUE_COLUMN_NAME, RecalUtils.COVARIATE_NAME_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME); + originalTable = originalReport.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE); + calculatedTable = calculatedReport.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE); + testTablesWithColumns(originalTable, calculatedTable, columnsToTest); } /** @@ -58,25 +80,12 @@ public class BQSRGathererUnitTest { * @param original the original table * @param calculated the calculated table * @param columnsToTest list of columns to test. All columns will be tested with the same criteria (equality given factor) - * @param factor 1 to test for equality, any other value to multiply the original value and match with the calculated */ - private void testTablesWithColumnsAndFactor(GATKReportTable original, GATKReportTable calculated, List columnsToTest, int factor) { + private void testTablesWithColumns(GATKReportTable original, GATKReportTable calculated, List columnsToTest) { for (int row = 0; row < original.getNumRows(); row++ ) { for (String column : columnsToTest) { Object actual = calculated.get(new Integer(row), column); Object expected = original.get(row, column); - - if (factor != 1) { - if (expected instanceof Double) - expected = (Double) expected * factor; - else if (expected instanceof Long) - expected = (Long) expected * factor; - else if (expected instanceof Integer) - expected = (Integer) expected * factor; - else if (expected instanceof Byte) { - expected = (Byte) expected * factor; - } - } Assert.assertEquals(actual, expected, "Row: " + row + " Original Table: " + original.getTableName() + " Calc Table: " + calculated.getTableName()); } } diff --git 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java index 6f1370008..9bec1b75d 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverageIntegrationTest.java @@ -9,7 +9,7 @@ import java.util.Arrays; import java.util.List; /** - * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl + * Integration tests for the Depth of Coverage walker * * @Author chartl * @Date Feb 25, 2010 diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java index 97b985a29..27e5f3d46 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/filters/VariantFiltrationIntegrationTest.java @@ -108,4 +108,22 @@ public class VariantFiltrationIntegrationTest extends WalkerTest { Arrays.asList("8ed32a2272bab8043a255362335395ef")); executeTest("testUnfilteredBecomesFilteredAndPass", spec); } + + @Test + public void testFilteringDPfromINFO() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --filterExpression 'DP < 8' --filterName lowDP -V " + privateTestDir + "filteringDepthInFormat.vcf", 1, + Arrays.asList("a01f7cce53ea556c9741aa60b6124c41")); + executeTest("testFilteringDPfromINFO", spec); + } + + @Test + public void testFilteringDPfromFORMAT() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T VariantFiltration -o %s --no_cmdline_in_header -R " + b37KGReference + + " --genotypeFilterExpression 'DP < 8' --genotypeFilterName lowDP -V " + privateTestDir + 
"filteringDepthInFormat.vcf", 1, + Arrays.asList("e10485c7c33d9211d0c1294fd7858476")); + executeTest("testFilteringDPfromFORMAT", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java deleted file mode 100644 index 306dddd65..000000000 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/ExactAFCalculationModelUnitTest.java +++ /dev/null @@ -1,112 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.genotyper; - -import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.Genotype; -import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; -import org.broadinstitute.sting.utils.variantcontext.GenotypesContext; -import org.testng.Assert; -import org.testng.annotations.BeforeSuite; -import org.testng.annotations.DataProvider; -import org.testng.annotations.Test; - -import java.util.Arrays; - - -public class ExactAFCalculationModelUnitTest extends BaseTest { - - static double[] AA1, AB1, BB1; - static double[] AA2, AB2, AC2, BB2, BC2, CC2; - static final int numSamples = 3; - static double[] priors = new double[2*numSamples+1]; // flat priors - - @BeforeSuite - public void before() { - AA1 = new double[]{0.0, -20.0, -20.0}; - AB1 = new double[]{-20.0, 0.0, -20.0}; - BB1 = new double[]{-20.0, -20.0, 0.0}; - AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0}; - AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0}; - AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0}; - BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0}; - BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0}; - CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0}; - } - - private class GetGLsTest extends TestDataProvider { - GenotypesContext GLs; - int 
numAltAlleles; - String name; - - private GetGLsTest(String name, int numAltAlleles, Genotype... arg) { - super(GetGLsTest.class, name); - GLs = GenotypesContext.create(arg); - this.name = name; - this.numAltAlleles = numAltAlleles; - } - - public String toString() { - return String.format("%s input=%s", super.toString(), GLs); - } - } - - private static Genotype createGenotype(String name, double[] gls) { - return new GenotypeBuilder(name, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL)).PL(gls).make(); - } - - @DataProvider(name = "getGLs") - public Object[][] createGLsData() { - - // bi-allelic case - new GetGLsTest("B0", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AA3", AA1)); - new GetGLsTest("B1", 1, createGenotype("AA1", AA1), createGenotype("AA2", AA1), createGenotype("AB", AB1)); - new GetGLsTest("B2", 1, createGenotype("AA1", AA1), createGenotype("BB", BB1), createGenotype("AA2", AA1)); - new GetGLsTest("B3a", 1, createGenotype("AB", AB1), createGenotype("AA", AA1), createGenotype("BB", BB1)); - new GetGLsTest("B3b", 1, createGenotype("AB1", AB1), createGenotype("AB2", AB1), createGenotype("AB3", AB1)); - new GetGLsTest("B4", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("AA", AA1)); - new GetGLsTest("B5", 1, createGenotype("BB1", BB1), createGenotype("AB", AB1), createGenotype("BB2", BB1)); - new GetGLsTest("B6", 1, createGenotype("BB1", BB1), createGenotype("BB2", BB1), createGenotype("BB3", BB1)); - - // tri-allelic case - new GetGLsTest("B1C0", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AB", AB2)); - new GetGLsTest("B0C1", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("AC", AC2)); - new GetGLsTest("B1C1a", 2, createGenotype("AA", AA2), createGenotype("AB", AB2), createGenotype("AC", AC2)); - new GetGLsTest("B1C1b", 2, createGenotype("AA1", AA2), createGenotype("AA2", AA2), createGenotype("BC", BC2)); - new GetGLsTest("B2C1", 2, 
createGenotype("AB1", AB2), createGenotype("AB2", AB2), createGenotype("AC", AC2)); - new GetGLsTest("B3C2a", 2, createGenotype("AB", AB2), createGenotype("BC1", BC2), createGenotype("BC2", BC2)); - new GetGLsTest("B3C2b", 2, createGenotype("AB", AB2), createGenotype("BB", BB2), createGenotype("CC", CC2)); - - return GetGLsTest.getTests(GetGLsTest.class); - } - - - @Test(dataProvider = "getGLs") - public void testGLs(GetGLsTest cfg) { - - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); - - int nameIndex = 1; - for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { - int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); - int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele]; - - Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); - } - } - - @Test - public void testLargeGLs() { - - final double[] BB = new double[]{-20000000.0, -20000000.0, 0.0}; - GetGLsTest cfg = new GetGLsTest("B6", 1, createGenotype("1", BB), createGenotype("2", BB), createGenotype("3", BB)); - - final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(2); - - ExactAFCalculationModel.linearExactMultiAllelic(cfg.GLs, cfg.numAltAlleles, priors, result); - - int calculatedAlleleCount = result.getAlleleCountsOfMAP()[0]; - Assert.assertEquals(calculatedAlleleCount, 6); - } -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 7b6e1ee96..9212d0e53 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -7,6 +7,7 @@ import 
org.testng.annotations.Test; import java.io.File; import java.util.Arrays; +import java.util.Collections; import java.util.List; // ********************************************************************************** // @@ -15,9 +16,10 @@ import java.util.List; public class UnifiedGenotyperIntegrationTest extends WalkerTest { - private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; - private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; + private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndels = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + private final static String baseCommandIndelsb37 = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -mbq 20 --dbsnp " + b37dbSNP132; + private final static String baseCommandNoCmdLineHeaderStdout = "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam"; // -------------------------------------------------------------------------------------------------------------- // @@ -28,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 
1, - Arrays.asList("0039fd0464c87e6ce66c4c8670fd8dfa")); + Arrays.asList("cdec335abc9ad8e59335e39a73e0e95a")); executeTest("test MultiSample Pilot1", spec); } @@ -36,7 +38,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("d1e68d4db6585ec00213b1d2d05e01a9")); + Arrays.asList("efddb5e258f97fd4f6661cff9eaa57de")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -44,7 +46,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("b53860d209f8440f12b78d01606553e1")); + Arrays.asList("24532eb381724cd74e99370da28d49ed")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -52,22 +54,22 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("61007c22c00a2871237280914a8f88f0")); + Arrays.asList("062a946160eec1d0fc135d58ca654ff4")); executeTest("test SingleSample Pilot2", spec); } @Test public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + 
b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("feda4a38bba096f7b740a146055509c2")); + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, + Arrays.asList("94dc17d76d841f1d3a36160767ffa034")); executeTest("test Multiple SNP alleles", spec); } @Test public void testBadRead() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, Arrays.asList("d915535c1458733f09f82670092fcab6")); executeTest("test bad read", spec); } @@ -75,18 +77,26 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("0ff525e65c5836289c454c76ead5d80e")); + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, + Arrays.asList("9106d01ca0d0a8fedd068e72d509f380")); executeTest("test reverse trim", spec); } + @Test + public void testMismatchedPLs() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header 
-glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, + Arrays.asList("d847acf841ba8ba653f996ce4869f439")); + executeTest("test mismatched PLs", spec); + } + // -------------------------------------------------------------------------------------------------------------- // // testing compressed output // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "e1a17f8f852c3d639f26e659d37bc1e5"; + private final static String COMPRESSED_OUTPUT_MD5 = "6792419c482e767a3deb28913ed2b1ad"; @Test public void testCompressedOutput() { @@ -107,24 +117,24 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "306943dd63111e2e64388cd2e2de6c01"; + String md5 = "d408b4661b820ed86272415b8ea08780"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( - baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, + baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, Arrays.asList(md5)); executeTest("test parallelization (single thread)", spec1); GenomeAnalysisEngine.resetRandomGenerator(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, + baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 2", 1, Arrays.asList(md5)); executeTest("test parallelization (2 threads)", spec2); 
GenomeAnalysisEngine.resetRandomGenerator(); WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( - baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, + baseCommand + " -dt NONE -G none --contamination_fraction_to_filter 0.0 -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000 -nt 4", 1, Arrays.asList(md5)); executeTest("test parallelization (4 threads)", spec3); } @@ -139,15 +149,15 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("b0b92abbaaa4c787dce6f1b302f983ee")); + Arrays.asList("56157d930da6ccd224bce1ca93f11e41")); executeTest("test min_base_quality_score 26", spec); } @Test public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("186d33429756c89aad6cd89424d6dc94")); + "-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("6ccb9bd88934e4272d0ce362dd35e603")); executeTest("test SLOD", spec); } @@ -155,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + 
"NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("11b87f68b8530da168c1418513115f30")); + Arrays.asList("480437dd6e2760f4ab3194431519f331")); executeTest("test NDA", spec); } @@ -163,23 +173,31 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("d2be4b1af1f29579c4f96c08e1ddd871")); + Arrays.asList("22c039412fd387dde6125b07c9a74a25")); executeTest("test using comp track", spec); } + @Test + public void testNoCmdLineHeaderStdout() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommandNoCmdLineHeaderStdout + " -glm INDEL -L 1:67,225,396-67,288,518", 0, + Collections.emptyList()); + executeTest("testNoCmdLineHeaderStdout", spec); + } + @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "0055bd060e6ef53a6b836903d68953c9"); + testOutputParameters("-sites_only", "40aeb4c9e31fe7046b72afc58e7599cb"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "235bec0a7b2d901442261104db18f5eb"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "c706ca93b25ff83613cb4e95dcac567c"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "7c57ede7019063c19aa9d2136045d84f"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "8a263fd0a94463ce1de9990f2b8ec841"); } private void testOutputParameters(final String args, final String md5) { @@ -193,18 +211,10 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new 
WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("3f8d724a5158adac4df38c4e2ed04167")); + Arrays.asList("df524e98903d96ab9353bee7c16a69de")); executeTest("test confidence 1", spec1); } - @Test - public void testConfidence2() { - WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( - baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_emit_conf 10 ", 1, - Arrays.asList("3f8d724a5158adac4df38c4e2ed04167")); - executeTest("test confidence 2", spec2); - } - // -------------------------------------------------------------------------------------------------------------- // // testing heterozygosity @@ -212,12 +222,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "7e7384a3a52e19f76f368c2f4561d510" ); + testHeterozosity( 0.01, "8e61498ca03a8d805372a64c466b3b42" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "3d16366d870c086e894c07c9da411795" ); + testHeterozosity( 1.0 / 1850, "668d06b5173cf3b97d052726988e1d7b" ); } private void testHeterozosity(final double arg, final String md5) { @@ -241,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("58abc4f504d3afd42271e290ac846c4b")); + Arrays.asList("908eb5e21fa39e7fb377cf4a9c4c7835")); executeTest(String.format("test multiple technologies"), spec); } @@ -260,7 +270,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("e247f579f01eb698cfa1ae1e8a3995a8")); + 
Arrays.asList("c814558bb0ed2e19b12e1a2bf4465d52")); executeTest(String.format("test calling with BAQ"), spec); } @@ -279,7 +289,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("cc2167dce156f70f5a31ac3dce499266")); + Arrays.asList("3593495aab5f6204c65de0b073a6ff65")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -294,7 +304,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("1268bde77842e6bb6a4f337c1d589f4d")); + Arrays.asList("8b486a098029d5a106b0a37eff541c15")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -307,7 +317,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("10c86ff98ad5ab800d208b435bcfbd7d")); + Arrays.asList("18efedc50cae2aacaba372265e38310b")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -317,7 +327,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("c0c4dbb050296633a3150b104b77e05a")); + Arrays.asList("3ff8c7c80a518aa3eb8671a21479de5f")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -327,7 +337,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - 
Arrays.asList("2472722f87f8718861698f60bbba2462")); + Arrays.asList("578c0540f4f2052a634a829bcb9cc27d")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -335,13 +345,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("eeb64b261f0a44aa478d753dbbf9378e")); + Arrays.asList("f7d0d0aee603df25c1f0525bb8df189e")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("d0a66c234056bb83dd84113bc2421f1e")); + Arrays.asList("fc91d457a16b4ca994959c2b5f3f0352")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -351,7 +361,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, - Arrays.asList("db0f91abb901e097714d8755058e1319")); + Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); } @@ -363,7 +373,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 20:10,000,000-10,100,000", 1, - Arrays.asList("b3c923ed9efa04b85fc18a9b45c8d2a6")); + 
Arrays.asList("1e0d2c15546c3b0959b00ffb75488b56")); executeTest(String.format("test UG with base indel quality scores"), spec); } @@ -397,7 +407,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("160600dfa8e46f91dbb5d574517aac74")); + Arrays.asList("857b8e5df444463ac27f665c4f67fbe2")); executeTest("test minIndelFraction 0.0", spec); } @@ -405,7 +415,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("aa58dc9f77132c30363562bcdc321f6e")); + Arrays.asList("81d4c7d9010fd6733b2997bc378e7471")); executeTest("test minIndelFraction 0.25", spec); } @@ -426,8 +436,56 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( - "-T UnifiedGenotyper -R " + b37KGReference + " -nosl --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141799600-141814700", 1, - Arrays.asList("22c9fd65ce3298bd7fbf400c9c209f29")); + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141799600-141814700", 1, + Arrays.asList("bd7984a374f0ae5d277bd5fc5065f64f")); executeTest("test calling on reads with Ns in CIGAR", spec); } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing reduced reads + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testReducedBam() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T 
UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, + Arrays.asList("9a7cd58b9e3d5b72608c0d529321deba")); + executeTest("test calling on a ReducedRead BAM", spec); + } + + @Test + public void testReducedBamSNPs() { + testReducedCalling("SNP", "e7fc11baf208a1bca7b462d3148c936e"); + } + + @Test + public void testReducedBamINDELs() { + testReducedCalling("INDEL", "132a4e0ccf9230b5bb4b56c649e2bdd5"); + } + + + private void testReducedCalling(final String model, final String md5) { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.reduced.bam -o %s -L 20:10,000,000-11,000,000 -glm " + model, 1, + Arrays.asList(md5)); + executeTest("test calling on a ReducedRead BAM with " + model, spec); + } + + // -------------------------------------------------------------------------------------------------------------- + // + // testing contamination down-sampling + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testContaminationDownsampling() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_to_filter 0.20", 1, + Arrays.asList("27dd04159e06d9524fb8a4eef41f96ae")); + executeTest("test contamination_percentage_to_filter 0.20", spec); + } + + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java index 040845828..9b464cfec 100755 --- 
a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerIntegrationTest.java @@ -113,4 +113,14 @@ public class IndelRealignerIntegrationTest extends WalkerTest { executeTest(String.format("realigner [%s]", entry.getKey()), spec); } } + + @Test + public void testNWayOut() { + WalkerTestSpec spec1 = new WalkerTestSpec( + baseCommandPrefix + " -nWayOut .clean.bam ", + 1, + Arrays.asList("d41d8cd98f00b204e9800998ecf8427e")); + executeTest("test realigner nWayOut", spec1); + } + } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java index c92d6d4cf..7a3a20bdd 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalIntegrationTest.java @@ -354,7 +354,7 @@ public class VariantEvalIntegrationTest extends WalkerTest { @Test public void testCompOverlap() { String extraArgs = "-T VariantEval -R " + b37KGReference + " -L " + variantEvalTestDataRoot + "pacbio.hg19.intervals --comp:comphapmap " + comparisonDataLocation + "Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf --eval " + variantEvalTestDataRoot + "pacbio.ts.recalibrated.vcf -noEV -EV CompOverlap -sn NA12878 -noST -ST Novelty -o %s"; - WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("59ad39e03678011b5f62492fa83ede04")); + WalkerTestSpec spec = new WalkerTestSpec(extraArgs,1,Arrays.asList("d0d9208060e69e157dac3bf01bdd83b0")); executeTestParallel("testCompOverlap",spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java index b780bcd00..0c4924229 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrationWalkersIntegrationTest.java @@ -1,10 +1,10 @@ package org.broadinstitute.sting.gatk.walkers.variantrecalibration; import org.broadinstitute.sting.WalkerTest; -import org.testng.annotations.Test; import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; -import java.util.*; +import java.util.Arrays; public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { private static class VRTest { @@ -26,9 +26,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { } VRTest lowPass = new VRTest(validationDataLocation + "phase1.projectConsensus.chr20.raw.snps.vcf", - "f360ce3eb2b0b887301be917a9843e2b", // tranches - "287fea5ea066bf3fdd71f5ce9b58eab3", // recal file - "356b9570817b9389da71fbe991d8b2f5"); // cut VCF + "4d08c8eee61dd1bdea8c5765f34e41f0", // tranches + "ce396fe4045e020b61471f6737dff36e", // recal file + "4f59bd61be900b25c6ecedaa68b9c8de"); // cut VCF @DataProvider(name = "VRTest") public Object[][] createData1() { @@ -75,9 +75,9 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { } VRTest bcfTest = new VRTest(privateTestDir + "vqsr.bcf_test.snps.unfiltered.bcf", - "a8ce3cd3dccafdf7d580bcce7d660a9a", // tranches - "74c10fc15f9739a938b7138909fbde04", // recal file - "62fda105e14b619a1c263855cf56af1d"); // cut VCF + "6a1eef4d02857dbb117a15420b5c0ce9", // tranches + "238366af66b05b6d21749e799c25353d", // recal file + "3928d6bc5007becf52312ade70f14c42"); // cut VCF @DataProvider(name = "VRBCFTest") public Object[][] createVRBCFTest() { @@ -129,13 +129,13 @@ public class 
VariantRecalibrationWalkersIntegrationTest extends WalkerTest { validationDataLocation + "combined.phase1.chr20.raw.indels.unfiltered.sites.vcf", // all FILTERs as . "b7589cd098dc153ec64c02dcff2838e4", // tranches "a04a9001f62eff43d363f4d63769f3ee", // recal file - "64f576881e21323dd4078262604717a2"); // cut VCF + "b2c6827be592c24a4692b1753edc7d23"); // cut VCF VRTest indelFiltered = new VRTest( validationDataLocation + "combined.phase1.chr20.raw.indels.filtered.sites.vcf", // all FILTERs as PASS "b7589cd098dc153ec64c02dcff2838e4", // tranches "a04a9001f62eff43d363f4d63769f3ee", // recal file - "af22c55d91394c56a222fd40d6d54781"); // cut VCF + "5d483fe1ba2ef36ee9e6c14cbd654706"); // cut VCF @DataProvider(name = "VRIndelTest") public Object[][] createTestVariantRecalibratorIndel() { @@ -193,7 +193,7 @@ public class VariantRecalibrationWalkersIntegrationTest extends WalkerTest { " -o %s" + " -tranchesFile " + privateTestDir + "VQSR.mixedTest.tranches" + " -recalFile " + privateTestDir + "VQSR.mixedTest.recal", - Arrays.asList("ec519e1f01459813dab57aefffc019e2")); + Arrays.asList("018b3a5cc7cf0cb5468c6a0c80ccaa8b")); executeTest("testApplyRecalibrationSnpAndIndelTogether", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java index e14580ead..a8309c14e 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/LiftoverVariantsIntegrationTest.java @@ -61,4 +61,13 @@ public class LiftoverVariantsIntegrationTest extends WalkerTest { Arrays.asList("7e7bad0e1890753a01303c09a38ceb8d")); executeTest("test hg18 to hg19, unsorted", spec); } + + @Test + public void testLiftoverFilteringOfIndels() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T FilterLiftedVariants 
-o %s -R " + b37KGReference + " --variant:vcf " + privateTestDir + "liftover_indel_test.vcf --no_cmdline_in_header", + 1, + Arrays.asList("0909a953291a5e701194668c9b8833ab")); + executeTest("test liftover filtering of indels", spec); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java index bde597fbe..a1d673b56 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariantsIntegrationTest.java @@ -20,7 +20,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, - Arrays.asList("d88bdae45ae0e74e8d8fd196627e612c") + Arrays.asList("954415f84996d27b07d00855e96d33a2") ); spec.disableShadowBCF(); @@ -49,7 +49,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { + b37hapmapGenotypes + " -disc " + testFile + " -o %s --no_cmdline_in_header -U LENIENT_VCF_PROCESSING", 1, - Arrays.asList("c0b937edb6a8b6392d477511d4f1ebcf") + Arrays.asList("ca1b5226eaeaffb78d4abd9d2ee10c43") ); spec.disableShadowBCF(); @@ -70,6 +70,20 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testComplexSelection--" + testfile, spec); } + @Test + public void testComplexSelectionWithNonExistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString(" --ALLOW_NONOVERLAPPING_COMMAND_LINE_SAMPLES -sn A -se '[CDH]' -sn Z -sn T -sf " + samplesFile + " -env -ef -select 'DP < 250' --variant " + testfile), + 1, + Arrays.asList("4386fbb258dcef4437495a37f5a83c53") + 
); + spec.disableShadowBCF(); + executeTest("testComplexSelectionWithNonExistingSamples--" + testfile, spec); + } + @Test public void testNonExistingFieldSelection() { String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; @@ -98,6 +112,21 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testSampleExclusion--" + testfile, spec); } + @Test + public void testSampleInclusionWithNonexistingSamples() { + String testfile = validationDataLocation + "test.filtered.maf_annotated.vcf"; + String samplesFile = validationDataLocation + "SelectVariants.samples.txt"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -L 1:1-1000000 -o %s --no_cmdline_in_header -sn A -sn Z -sn Q -sf " + samplesFile + " --variant " + testfile, + 1, + UserException.BadInput.class + ); + spec.disableShadowBCF(); + + executeTest("testSampleInclusionWithNonexistingSamples--" + testfile, spec); + } + @Test public void testConcordance() { @@ -128,6 +157,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest { executeTest("testVariantTypeSelection--" + testFile, spec); } + @Test + public void testIndelLengthSelection() { + String testFile = privateTestDir + "complexExample1.vcf"; + + WalkerTestSpec spec = new WalkerTestSpec( + "-T SelectVariants -R " + b36KGReference + " -selectType INDEL --variant " + testFile + " -o %s --no_cmdline_in_header --maxIndelSize 3", + 1, + Arrays.asList("004589868ca5dc887e2dff876b4cc797") + ); + + executeTest("testIndelLengthSelection--" + testFile, spec); + } + @Test public void testUsingDbsnpName() { String testFile = privateTestDir + "combine.3.vcf"; @@ -148,7 +190,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("52cb2f150559ca1457e9df7ec153dbb4") + 
Arrays.asList("46ff472fc7ef6734ad01170028d5924a") ); executeTest("testRegenotype--" + testFile, spec); @@ -174,7 +216,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b36KGReference + " -regenotype -sn NA12892 --variant " + testFile + " -o %s --no_cmdline_in_header", 1, - Arrays.asList("52cb2f150559ca1457e9df7ec153dbb4") + Arrays.asList("46ff472fc7ef6734ad01170028d5924a") ); executeTest("testRemoveMLEAndRegenotype--" + testFile, spec); @@ -213,7 +255,7 @@ public class SelectVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T SelectVariants -R " + b37KGReference + " -o %s --no_cmdline_in_header -sf " + samplesFile + " --excludeNonVariants --variant " + testfile, 1, - Arrays.asList("3ab35d5e81a29fb5db3e2add11c7e823") + Arrays.asList("f14d75892b99547d8e9ba3a03bfb04ea") ); executeTest("test select from multi allelic with excludeNonVariants --" + testfile, spec); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java index 6a3d755d7..67d47997b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/ValidateVariantsIntegrationTest.java @@ -33,6 +33,8 @@ import java.util.Arrays; public class ValidateVariantsIntegrationTest extends WalkerTest { + protected static final String emptyMd5 = "d41d8cd98f00b204e9800998ecf8427e"; + public static String baseTestString(String file, String type) { return "-T ValidateVariants -R " + b36KGReference + " -L 1:10001292-10001303 --variant:vcf " + privateTestDir + file + " --validationType " + type; } @@ -42,7 +44,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new 
WalkerTestSpec( baseTestString("validationExampleGood.vcf", "ALL"), 0, - Arrays.asList("d41d8cd98f00b204e9800998ecf8427e") + Arrays.asList(emptyMd5) ); executeTest("test good file", spec); @@ -53,7 +55,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("validationExampleBad.vcf", "REF"), 0, - UserException.MalformedFile.class + UserException.FailsStrictValidation.class ); executeTest("test bad ref base #1", spec); @@ -64,7 +66,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("validationExampleBad2.vcf", "REF"), 0, - UserException.MalformedFile.class + UserException.FailsStrictValidation.class ); executeTest("test bad ref base #2", spec); @@ -75,7 +77,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("validationExampleBad.vcf", "CHR_COUNTS"), 0, - UserException.MalformedFile.class + UserException.FailsStrictValidation.class ); executeTest("test bad chr counts #1", spec); @@ -86,7 +88,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("validationExampleBad2.vcf", "CHR_COUNTS"), 0, - UserException.MalformedFile.class + UserException.FailsStrictValidation.class ); executeTest("test bad chr counts #2", spec); @@ -97,7 +99,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("validationExampleBad.vcf", "IDS") + " --dbsnp " + b36dbSNP129, 0, - UserException.MalformedFile.class + UserException.FailsStrictValidation.class ); executeTest("test bad RS ID", spec); @@ -108,7 +110,7 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("validationExampleBad.vcf", "ALLELES"), 0, - UserException.MalformedFile.class + 
UserException.FailsStrictValidation.class ); executeTest("test bad alt allele", spec); @@ -119,18 +121,29 @@ public class ValidateVariantsIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("validationExampleBad3.vcf", "REF"), 0, - UserException.MalformedFile.class + UserException.FailsStrictValidation.class ); executeTest("test bad ref allele in deletion", spec); } + @Test + public void testNoValidation() { + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("validationExampleBad.vcf", "NONE"), + 0, + Arrays.asList(emptyMd5) + ); + + executeTest("test no validation", spec); + } + @Test public void testComplexEvents() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString("complexEvents.vcf", "ALL"), 0, - Arrays.asList("d41d8cd98f00b204e9800998ecf8427e") + Arrays.asList(emptyMd5) ); executeTest("test validating complex events", spec); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java index 3e59508bc..8f11c09f6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToBinaryPedIntegrationTest.java @@ -28,6 +28,13 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { } + public static String baseTestString(String inputVCF, String inputMetaData, int gq, String mode) { + return "-T VariantsToBinaryPed -R " + b37KGReference + " -mode "+mode + + " -V " + VTBP_DATA_DIR+inputVCF + " -m "+VTBP_DATA_DIR+inputMetaData + String.format(" -mgq %d",gq) + + " -bim %s -fam %s -bed %s"; + + } + @Test public void testNA12878Alone() { String testName = "testNA12878Alone"; @@ -52,6 +59,18 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { executeTest(testName, spec); } + 
@Test + public void testNA12878AloneSNPMajor() { + String testName = "testNA12878AloneSNPMajor"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("NA12878.subset.vcf", "CEUTrio.NA12878.metadata.txt",10,"SNP_MAJOR"), + 3, + Arrays.asList("411ef932095728bfa5e509c2c0e4cfa8","7251ca4e8a515b698e7e7d25cff91978","ada1acc475d096012b921b3219c3a446") + ); + + executeTest(testName, spec); + } + @Test public void testNA12878HighGQ() { String testName = "testNA12878HighGQ"; @@ -86,6 +105,16 @@ public class VariantsToBinaryPedIntegrationTest extends WalkerTest { ); } + @Test + public void test1000GWithIndelsSNPMajor() { + String testName = "test1000GWithIndelsSNPMajor"; + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString("1000G_selected_allVariants.vcf", "1000G_selected_allVariants.md.txt",0,"SNP_MAJOR"), + 3, + Arrays.asList("3c98112434d9948dc47da72ad14e8d84","4a0ba3d0594b06306aa6459e4e28ec9a","451498ceff06c1649890900fa994f1af") + ); + } + @Test public void test1000G_Symbolic() { String testName = "test1000G_Symbolic"; diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java index 2ffcd02e2..8186ffc7d 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTableIntegrationTest.java @@ -63,7 +63,7 @@ public class VariantsToTableIntegrationTest extends WalkerTest { @Test(enabled = true) public void testMultiAllelicOneRecord() { WalkerTestSpec spec = new WalkerTestSpec(variantsToTableMultiAllelicCmd(""), - Arrays.asList("13dd36c08be6c800f23988e6000d963e")); + Arrays.asList("0ff49c08690f61a38614606a090f23ea")); executeTest("testMultiAllelicOneRecord", spec); } @@ -100,6 +100,19 @@ public class VariantsToTableIntegrationTest extends WalkerTest { 
executeTest("testGenotypeFieldsWithInline", spec); } + @Test(enabled = true) + public void testListFields() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + "-R " + b36KGReference + + " --variant " + privateTestDir + "vcfexample.withMLE.vcf" + + " -T VariantsToTable" + + " -GF PL" + + " -o %s", + 1, + Arrays.asList("1cb2737ab0eaee0a9ae25ab2e7ac3e7e")); + executeTest("testGenotypeFields", spec); + } + @Test(enabled = true) public void testMoltenOutput() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java index 49778a4d8..122e0265f 100644 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocUnitTest.java @@ -16,6 +16,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import java.io.File; import java.io.FileNotFoundException; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -211,4 +212,59 @@ public class GenomeLocUnitTest extends BaseTest { Assert.assertEquals(cfg.gl1.reciprocialOverlapFraction(cfg.gl2), cfg.overlapFraction); } } + + // ------------------------------------------------------------------------------------- + // + // testing comparison, hashcode, and equals + // + // ------------------------------------------------------------------------------------- + + @DataProvider(name = "GenomeLocComparisons") + public Object[][] createGenomeLocComparisons() { + List tests = new ArrayList(); + + final int start = 10; + for ( int stop = start; stop < start + 3; stop++ ) { + final GenomeLoc g1 = genomeLocParser.createGenomeLoc("chr2", start, stop); + for ( final String contig : Arrays.asList("chr1", "chr2", "chr3")) { + for ( int start2 = start - 1; start2 <= stop + 1; start2++ ) { + for ( int stop2 = 
start2; stop2 < stop + 2; stop2++ ) { + final GenomeLoc g2 = genomeLocParser.createGenomeLoc(contig, start2, stop2); + + ComparisonResult cmp = ComparisonResult.EQUALS; + if ( contig.equals("chr3") ) cmp = ComparisonResult.LESS_THAN; + else if ( contig.equals("chr1") ) cmp = ComparisonResult.GREATER_THAN; + else if ( start < start2 ) cmp = ComparisonResult.LESS_THAN; + else if ( start > start2 ) cmp = ComparisonResult.GREATER_THAN; + else if ( stop < stop2 ) cmp = ComparisonResult.LESS_THAN; + else if ( stop > stop2 ) cmp = ComparisonResult.GREATER_THAN; + + tests.add(new Object[]{g1, g2, cmp}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private enum ComparisonResult { + LESS_THAN(-1), + EQUALS(0), + GREATER_THAN(1); + + final int cmp; + + private ComparisonResult(int cmp) { + this.cmp = cmp; + } + } + + @Test(dataProvider = "GenomeLocComparisons") + public void testGenomeLocComparisons(GenomeLoc g1, GenomeLoc g2, ComparisonResult expected) { + Assert.assertEquals(g1.compareTo(g2), expected.cmp, "Comparing genome locs failed"); + Assert.assertEquals(g1.equals(g2), expected == ComparisonResult.EQUALS); + if ( expected == ComparisonResult.EQUALS ) + Assert.assertEquals(g1.hashCode(), g2.hashCode(), "Equal genome locs don't have the same hash code"); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java similarity index 99% rename from public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java index 0f19e2f90..5b052454a 100644 --- a/public/java/test/org/broadinstitute/sting/utils/ReservoirDownsamplerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/LegacyReservoirDownsamplerUnitTest.java @@ -17,7 +17,7 @@ import java.util.*; * @author mhanna * @version 0.1 */ -public class 
ReservoirDownsamplerUnitTest { +public class LegacyReservoirDownsamplerUnitTest { private static final SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(1,1,200); diff --git a/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java index 6a01bb0b4..edd1bc356 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MWUnitTest.java @@ -40,12 +40,15 @@ public class MWUnitTest extends BaseTest { Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu.getObservations(),MannWhitneyU.USet.SET2),11L); MannWhitneyU mwu2 = new MannWhitneyU(); + MannWhitneyU mwuNoDither = new MannWhitneyU(false); for ( int dp : new int[]{2,4,5,6,8} ) { mwu2.add(dp,MannWhitneyU.USet.SET1); + mwuNoDither.add(dp,MannWhitneyU.USet.SET1); } for ( int dp : new int[]{1,3,7,9,10,11,12,13} ) { mwu2.add(dp,MannWhitneyU.USet.SET2); + mwuNoDither.add(dp,MannWhitneyU.USet.SET2); } MannWhitneyU.ExactMode pm = MannWhitneyU.ExactMode.POINT; @@ -54,6 +57,8 @@ public class MWUnitTest extends BaseTest { // tests using the hypothesis that set 2 dominates set 1 (U value = 10) Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET1),10L); Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwu2.getObservations(),MannWhitneyU.USet.SET2),30L); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET1),10L); + Assert.assertEquals(MannWhitneyU.calculateOneSidedU(mwuNoDither.getObservations(),MannWhitneyU.USet.SET2),30L); Pair sizes = mwu2.getSetSizes(); diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 04b0199d8..fc2b2638b 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ 
b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -225,65 +225,67 @@ public class MathUtilsUnitTest extends BaseTest { @Test public void testApproximateLog10SumLog10() { + + final double requiredPrecision = 1E-4; - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0}), 0.0, 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-5.15}), -5.15, 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {130.0}), 130.0, 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.145}), -0.145, 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0}), 0.0, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-5.15}), -5.15, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {130.0}), 130.0, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.145}), -0.145, requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3); - 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, Double.NEGATIVE_INFINITY), -0.12345, 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, Double.NEGATIVE_INFINITY), -15.7654, 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); + 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, Double.NEGATIVE_INFINITY), -0.12345, requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, Double.NEGATIVE_INFINITY), -15.7654, requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2}), 
Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); + 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2, 18.1}), 
Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); + 
Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0, -2.5), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5, -1.1), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + 
Math.pow(10.0, -1.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1, 0.5), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2, 1.3), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2, 18.1), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2, 26.6), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1, -45.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6, -26.2), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456, -0.34567), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3); - Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101, -17.9341), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0, -2.5), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5, -1.1), 
Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1, 0.5), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2, 1.3), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2, 18.1), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2, 26.6), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1, -45.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6, -26.2), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456, -0.34567), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101, -17.9341), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); } @Test @@ -299,14 +301,47 @@ public class MathUtilsUnitTest extends BaseTest { @Test public void testLog10sumLog10() { + final double requiredPrecision = 1E-14; + final double log3 = 0.477121254719662; - Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}), log3), 0); - Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0), log3), 0); - 
Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}, 0, 3), log3), 0); + Assert.assertEquals(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}), log3, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0), log3, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[]{0.0, 0.0, 0.0}, 0, 3), log3, requiredPrecision); final double log2 = 0.301029995663981; - Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 2), log2), 0); - Assert.assertEquals(MathUtils.compareDoubles(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 1), 0.0), 0); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 2), log2, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}, 0, 1), 0.0, requiredPrecision); + + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0}), 0.0, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-5.15}), -5.15, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {130.0}), 130.0, requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.145}), -0.145, requiredPrecision); + + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, -7.1}), 
Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), requiredPrecision); + + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + 
Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), requiredPrecision); + Assert.assertEquals(MathUtils.log10sumLog10(new double[] {-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), requiredPrecision); } @Test diff --git a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java index 7a2696b7b..7285c00ac 100755 --- a/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/SimpleTimerUnitTest.java @@ -1,12 +1,12 @@ package org.broadinstitute.sting.utils; import org.broadinstitute.sting.BaseTest; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import 
org.testng.annotations.Test; -import java.io.File; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; public class SimpleTimerUnitTest extends BaseTest { private final static String NAME = "unit.test.timer"; @@ -17,33 +17,88 @@ public class SimpleTimerUnitTest extends BaseTest { Assert.assertEquals(t.getName(), NAME, "Name is not the provided one"); Assert.assertFalse(t.isRunning(), "Initial state of the timer is running"); Assert.assertEquals(t.getElapsedTime(), 0.0, "New timer elapsed time should be 0"); + Assert.assertEquals(t.getElapsedTimeNano(), 0l, "New timer elapsed time nano should be 0"); t.start(); Assert.assertTrue(t.isRunning(), "Started timer isn't running"); Assert.assertTrue(t.getElapsedTime() >= 0.0, "Elapsed time should be >= 0"); + Assert.assertTrue(t.getElapsedTimeNano() >= 0.0, "Elapsed time nano should be >= 0"); + long n1 = t.getElapsedTimeNano(); double t1 = t.getElapsedTime(); idleLoop(); // idle loop to wait a tiny bit of time + long n2 = t.getElapsedTimeNano(); double t2 = t.getElapsedTime(); Assert.assertTrue(t2 >= t1, "T2 >= T1 for a running time"); + Assert.assertTrue(n2 >= n1, "T2 >= T1 nano for a running time"); t.stop(); Assert.assertFalse(t.isRunning(), "Stopped timer still running"); + long n3 = t.getElapsedTimeNano(); double t3 = t.getElapsedTime(); idleLoop(); // idle loop to wait a tiny bit of time double t4 = t.getElapsedTime(); + long n4 = t.getElapsedTimeNano(); Assert.assertTrue(t4 == t3, "Elapsed times for two calls of stop timer not the same"); + Assert.assertTrue(n4 == n3, "Elapsed times for two calls of stop timer not the same"); t.restart(); idleLoop(); // idle loop to wait a tiny bit of time double t5 = t.getElapsedTime(); + long n5 = t.getElapsedTimeNano(); Assert.assertTrue(t.isRunning(), "Restarted timer should be running"); idleLoop(); // idle loop to wait a tiny bit of time double t6 = t.getElapsedTime(); + long n6 = t.getElapsedTimeNano(); Assert.assertTrue(t5 >= t4, 
"Restarted timer elapsed time should be after elapsed time preceding the restart"); Assert.assertTrue(t6 >= t5, "Second elapsed time not after the first in restarted timer"); + Assert.assertTrue(n5 >= n4, "Restarted timer elapsed time nano should be after elapsed time preceding the restart"); + Assert.assertTrue(n6 >= n5, "Second elapsed time nano not after the first in restarted timer"); + + final List secondTimes = Arrays.asList(t1, t2, t3, t4, t5, t6); + final List nanoTimes = Arrays.asList(n1, n2, n3, n4, n5, n6); + for ( int i = 0; i < nanoTimes.size(); i++ ) + Assert.assertEquals( + SimpleTimer.nanoToSecondsAsDouble(nanoTimes.get(i)), + secondTimes.get(i), 1e-1, "Nanosecond and second timer disagree"); } - private final static void idleLoop() { + @Test + public void testNanoResolution() { + SimpleTimer t = new SimpleTimer(NAME); + + // test the nanosecond resolution + long n7 = t.currentTimeNano(); + int sum = 0; + for ( int i = 0; i < 100; i++) sum += i; + long n8 = t.currentTimeNano(); + final long delta = n8 - n7; + final long oneMilliInNano = TimeUnit.MILLISECONDS.toNanos(1); + logger.warn("nanoTime before nano operation " + n7); + logger.warn("nanoTime after nano operation of summing 100 ints " + n8 + ", sum = " + sum + " time delta " + delta + " vs. 
1 millsecond in nano " + oneMilliInNano); + Assert.assertTrue(n8 > n7, "SimpleTimer doesn't appear to have nanoSecond resolution: n8 " + n8 + " <= n7 " + n7); + Assert.assertTrue(delta < oneMilliInNano, + "SimpleTimer doesn't appear to have nanoSecond resolution: time delta is " + delta + " vs 1 millisecond in nano " + oneMilliInNano); + } + + @Test + public void testMeaningfulTimes() { + SimpleTimer t = new SimpleTimer(NAME); + + t.start(); + for ( int i = 0; i < 100; i++ ) ; + long nano = t.getElapsedTimeNano(); + double secs = t.getElapsedTime(); + + Assert.assertTrue(secs > 0, "Seconds timer doesn't appear to count properly: elapsed time is " + secs); + Assert.assertTrue(secs < 0.01, "Fast operation said to take longer than 10 milliseconds: elapsed time in seconds " + secs); + + Assert.assertTrue(nano > 0, "Nanosecond timer doesn't appear to count properly: elapsed time is " + nano); + final long maxTimeInMicro = 100; + final long maxTimeInNano = TimeUnit.MICROSECONDS.toNanos(100); + Assert.assertTrue(nano < maxTimeInNano, "Fast operation said to take longer than " + maxTimeInMicro + " microseconds: elapsed time in nano " + nano + " micro " + TimeUnit.NANOSECONDS.toMicros(nano)); + } + + private static void idleLoop() { for ( int i = 0; i < 100000; i++ ) ; // idle loop to wait a tiny bit of time } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index c05b11cf7..736162300 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -5,21 +5,24 @@ package org.broadinstitute.sting.utils.fasta; // the imports for unit testing. 
-import org.broadinstitute.sting.utils.exceptions.UserException; -import org.testng.Assert; -import org.testng.annotations.Test; -import org.testng.annotations.DataProvider; -import org.broadinstitute.sting.BaseTest; - -import java.io.File; -import java.io.FileNotFoundException; -import java.util.Arrays; -import java.util.List; -import java.util.ArrayList; - import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.SAMSequenceRecord; +import org.apache.log4j.Priority; +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; /** * Basic unit test for GenomeLoc @@ -30,7 +33,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { //private static final List QUERY_SIZES = Arrays.asList(1); private static final List QUERY_SIZES = Arrays.asList(1, 10, 100); - private static final List CACHE_SIZES = Arrays.asList(-1, 1000); + private static final List CACHE_SIZES = Arrays.asList(-1, 100, 1000); @DataProvider(name = "fastas") public Object[][] createData1() { @@ -46,20 +49,24 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { return params.toArray(new Object[][]{}); } - @Test(dataProvider = "fastas", enabled = true) - public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) { - IndexedFastaSequenceFile caching, uncached; - try { - caching = cacheSize == -1 ? 
new CachingIndexedFastaSequenceFile(fasta) : new CachingIndexedFastaSequenceFile(fasta, cacheSize); - uncached = new IndexedFastaSequenceFile(fasta); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(fasta,ex); - } + private static long getCacheSize(final long cacheSizeRequested) { + return cacheSizeRequested == -1 ? CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE : cacheSizeRequested; + } - SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); + @Test(dataProvider = "fastas", enabled = true) + public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException { + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + + SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0); logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", contig.getSequenceName(), contig.getSequenceLength(), cacheSize, querySize)); + testSequential(caching, fasta, querySize); + } + + private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException { + final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); + + SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); for ( int i = 0; i < contig.getSequenceLength(); i += STEP_SIZE ) { int start = i; int stop = start + querySize; @@ -72,19 +79,23 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { Assert.assertEquals(cachedVal.getBases(), uncachedVal.getBases()); } } + + // asserts for efficiency. We are going to make contig.length / STEP_SIZE queries + // at each of range: start -> start + querySize against a cache with size of X. + // we expect to hit the cache each time range falls within X. We expect a hit + // on the cache if range is within X. 
Which should happen at least (X - query_size * 2) / STEP_SIZE + // times. + final int minExpectedHits = (int)Math.floor((Math.min(caching.getCacheSize(), contig.getSequenceLength()) - querySize * 2.0) / STEP_SIZE); + caching.printEfficiency(Priority.WARN); + Assert.assertTrue(caching.getCacheHits() >= minExpectedHits, "Expected at least " + minExpectedHits + " cache hits but only got " + caching.getCacheHits()); + } // Tests grabbing sequences around a middle cached value. @Test(dataProvider = "fastas", enabled = true) - public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) { - IndexedFastaSequenceFile caching, uncached; - try { - uncached = new IndexedFastaSequenceFile(fasta); - caching = new CachingIndexedFastaSequenceFile(fasta, cacheSize); - } - catch(FileNotFoundException ex) { - throw new UserException.CouldNotReadInputFile(fasta,ex); - } + public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException { + final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -108,4 +119,48 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } } } + + @DataProvider(name = "ParallelFastaTest") + public Object[][] createParallelFastaTest() { + List params = new ArrayList(); +// for ( int nt : Arrays.asList(1, 2, 3) ) { +// for ( int cacheSize : CACHE_SIZES ) { +// params.add(new Object[]{simpleFasta, cacheSize, 10, nt}); +// } +// } + + for ( File fasta : Arrays.asList(simpleFasta) ) { + for ( int cacheSize : CACHE_SIZES ) { + for ( int querySize : QUERY_SIZES ) { + for ( int nt : Arrays.asList(1, 2, 3, 4) ) { + params.add(new Object[]{fasta, cacheSize, querySize, nt}); + } + } + } + } + + return params.toArray(new Object[][]{}); + } + + + 
@Test(dataProvider = "ParallelFastaTest", enabled = true, timeOut = 60000) + public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException { + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + + logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt)); + for ( int iterations = 0; iterations < 1; iterations++ ) { + final ExecutorService executor = Executors.newFixedThreadPool(nt); + final Collection> tasks = new ArrayList>(nt); + for ( int i = 0; i < nt; i++ ) + tasks.add(new Callable() { + @Override + public Object call() throws Exception { + testSequential(caching, fasta, querySize); + return null; + } + }); + executor.invokeAll(tasks); + executor.shutdownNow(); + } + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java new file mode 100644 index 000000000..6c59f1585 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/InputProducerUnitTest.java @@ -0,0 +1,147 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.LinkedBlockingDeque; +import java.util.concurrent.Semaphore; + +/** +* UnitTests for the InputProducer +* +* User: depristo +* 
Date: 8/24/12 +* Time: 11:25 AM +* To change this template use File | Settings | File Templates. +*/ +public class InputProducerUnitTest extends BaseTest { + @DataProvider(name = "InputProducerTest") + public Object[][] createInputProducerTest() { + List tests = new ArrayList(); + + for ( final int nElements : Arrays.asList(0, 1, 10, 100, 1000, 10000, 100000) ) { + for ( final int queueSize : Arrays.asList(1, 10, 100) ) { + tests.add(new Object[]{ nElements, queueSize }); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "InputProducerTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) + public void testInputProducer(final int nElements, final int queueSize) throws InterruptedException { + final List elements = new ArrayList(nElements); + for ( int i = 0; i < nElements; i++ ) elements.add(i); + + final LinkedBlockingDeque.InputValue> readQueue = + new LinkedBlockingDeque.InputValue>(queueSize); + + final InputProducer ip = new InputProducer(elements.iterator(), new MultiThreadedErrorTracker(), new SimpleTimer(), readQueue); + + final ExecutorService es = Executors.newSingleThreadExecutor(); + + Assert.assertFalse(ip.allInputsHaveBeenRead(), "InputProvider said that all inputs have been read, but I haven't started reading yet"); + Assert.assertEquals(ip.getNumInputValues(), -1, "InputProvider told me that the queue was done, but I haven't started reading yet"); + + es.submit(ip); + + int lastValue = -1; + int nRead = 0; + while ( true ) { + final int nTotalElements = ip.getNumInputValues(); + final int observedQueueSize = readQueue.size(); + Assert.assertTrue(observedQueueSize <= queueSize, + "Reader is enqueuing more elements " + observedQueueSize + " than allowed " + queueSize); + + if ( nRead + observedQueueSize < nElements ) + Assert.assertEquals(nTotalElements, -1, "getNumInputValues should have returned -1 with not all elements read"); + // note, cannot test else case because elements input could 
have emptied between calls + + final InputProducer.InputValue value = readQueue.take(); + if ( value.isEOFMarker() ) { + Assert.assertEquals(nRead, nElements, "Number of input values " + nRead + " not all that are expected " + nElements); + Assert.assertEquals(readQueue.size(), 0, "Last queue element found but queue contains more values!"); + break; + } else { + Assert.assertTrue(lastValue < value.getValue(), "Read values coming out of order!"); + final int expected = lastValue + 1; + Assert.assertEquals((int)value.getValue(), expected, "Value observed " + value.getValue() + " not equal to the expected value " + expected); + nRead++; + lastValue = value.getValue(); + } + } + + Assert.assertTrue(ip.allInputsHaveBeenRead(), "InputProvider said that all inputs haven't been read, but I read them all"); + Assert.assertEquals(ip.getNumInputValues(), nElements, "Wrong number of total elements getNumInputValues"); + es.shutdownNow(); + } + + @Test(enabled = true, dataProvider = "InputProducerTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) + public void testInputProducerLocking(final int nElements, final int queueSize) throws InterruptedException { + final List elements = new ArrayList(nElements); + for ( int i = 0; i < nElements; i++ ) elements.add(i); + + final LinkedBlockingDeque.InputValue> readQueue = + new LinkedBlockingDeque.InputValue>(); + + final InputProducer ip = new InputProducer(elements.iterator(), new MultiThreadedErrorTracker(), new SimpleTimer(), readQueue); + + final ExecutorService es = Executors.newSingleThreadExecutor(); + es.submit(ip); + + ip.waitForDone(); + + Assert.assertEquals(ip.getNumInputValues(), nElements, "InputProvider told me that the queue was done, but I haven't started reading yet"); + Assert.assertEquals(readQueue.size(), nElements + 1, "readQueue should have had all elements read into it"); + } + + final static class BlockingIterator implements Iterator { + final Semaphore blockNext = new Semaphore(0); + final 
Semaphore blockOnNext = new Semaphore(0); + final Iterator underlyingIterator; + + BlockingIterator(Iterator underlyingIterator) { + this.underlyingIterator = underlyingIterator; + } + + public void allowNext() { + blockNext.release(1); + } + + public void blockTillNext() throws InterruptedException { + blockOnNext.acquire(1); + } + + @Override + public boolean hasNext() { + return underlyingIterator.hasNext(); + } + + @Override + public T next() { + try { + blockNext.acquire(1); + T value = underlyingIterator.next(); + blockOnNext.release(1); + return value; + } catch (InterruptedException ex) { + throw new RuntimeException(ex); + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException("x"); + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java new file mode 100755 index 000000000..80b0b4ee2 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerIntegrationTest.java @@ -0,0 +1,53 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +// ********************************************************************************** // +// Note that this class also serves as an integration test for the VariantAnnotator! 
// +// ********************************************************************************** // + +public class NanoSchedulerIntegrationTest extends WalkerTest { + @DataProvider(name = "NanoSchedulerUGTest") + public Object[][] createNanoSchedulerUGTest() { + List tests = new ArrayList(); + + for ( final int nt : Arrays.asList(1, 2) ) + for ( final int nct : Arrays.asList(1, 2) ) { +// tests.add(new Object[]{ "SNP", "a1c7546f32a8919a3f3a70a04b2e8322", nt, nct }); +//// tests.add(new Object[]{ "INDEL", "0a6d2be79f4f8a4b0eb788cc4751b31b", nt, nct }); + tests.add(new Object[]{ "BOTH", "85fc5d6dfeb60ed89763470f4b4c981e", nt, nct }); + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, dataProvider = "NanoSchedulerUGTest") + private void testNanoSchedulerUGTest(final String glm, final String md5, final int nt, final int nct ) { + WalkerTestSpec spec = new WalkerTestSpec( + buildCommandLine( + "-T UnifiedGenotyper -R " + b37KGReference, + "--no_cmdline_in_header -G", + //"--dbsnp " + b37dbSNP132, + "-I " + privateTestDir + "NA12878.HiSeq.b37.chr20.10_11mb.bam", + "-L 20:10,000,000-10,100,000", + "-glm " + glm, + "--contamination_fraction_to_filter 0.0", + "-nt " + nt, + "-nct " + nct, + "-o %s" + ), + 1, + Arrays.asList(md5) + ); + executeTest(String.format("testUG-glm:%s-nt%d-nct%d", glm, nt, nct), spec); + } + + + +} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java new file mode 100644 index 000000000..af2e18ad9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/NanoSchedulerUnitTest.java @@ -0,0 +1,318 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.apache.log4j.BasicConfigurator; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; 
+import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +/** + * UnitTests for the NanoScheduler + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. + */ +public class NanoSchedulerUnitTest extends BaseTest { + private final static boolean DEBUG = false; + private final static boolean debug = false; + public static final int NANO_SCHEDULE_MAX_RUNTIME = 30000; + + private static class Map2x implements NSMapFunction { + @Override public Integer apply(Integer input) { return input * 2; } + } + + private static void maybeDelayMe(final int input) { + try { + if ( input % 7 == 0 ) { + final int milliToSleep = (input % 10); + //System.out.printf("Sleeping %d millseconds%n", milliToSleep); + Thread.sleep(milliToSleep); + } + } catch ( InterruptedException ex ) { + throw new RuntimeException(ex); + } + } + + private static class Map2xWithDelays extends Map2x { + @Override public Integer apply(Integer input) { + maybeDelayMe(input); + return input * 2; + } + } + + private static class ReduceSum implements NSReduceFunction { + int prevOne = Integer.MIN_VALUE; + + @Override public Integer apply(Integer one, Integer sum) { + Assert.assertTrue(prevOne < one, "Reduce came in out of order. 
Prev " + prevOne + " cur " + one); + return one + sum; + } + } + + private static class ProgressCallback implements NSProgressFunction { + int callBacks = 0; + + @Override + public void progress(Integer lastMapInput) { + callBacks++; + } + } + + + private static int sum2x(final int start, final int end) { + int sum = 0; + for ( int i = start; i < end; i++ ) + sum += 2 * i; + return sum; + } + + private static class NanoSchedulerBasicTest extends TestDataProvider { + final int bufferSize, nThreads, start, end, expectedResult; + final boolean addDelays; + + public NanoSchedulerBasicTest(final int bufferSize, final int nThreads, final int start, final int end, final boolean addDelays) { + super(NanoSchedulerBasicTest.class); + this.bufferSize = bufferSize; + this.nThreads = nThreads; + this.start = start; + this.end = end; + this.expectedResult = sum2x(start, end); + this.addDelays = addDelays; + setName(String.format("%s nt=%d buf=%d start=%d end=%d sum=%d delays=%b", + getClass().getSimpleName(), nThreads, bufferSize, start, end, expectedResult, addDelays)); + } + + public Iterator makeReader() { + final List ints = new ArrayList(); + for ( int i = start; i < end; i++ ) + ints.add(i); + return ints.iterator(); + } + + public int nExpectedCallbacks() { + int nElements = Math.max(end - start, 0); + return nElements / bufferSize; + } + + public Map2x makeMap() { return addDelays ? 
new Map2xWithDelays() : new Map2x(); } + public Integer initReduce() { return 0; } + public ReduceSum makeReduce() { return new ReduceSum(); } + + public NanoScheduler makeScheduler() { + final NanoScheduler nano; + if ( bufferSize == -1 ) + nano = new NanoScheduler(nThreads); + else + nano = new NanoScheduler(bufferSize, nThreads); + + nano.setDebug(debug); + return nano; + } + } + + static NanoSchedulerBasicTest exampleTest = null; + static NanoSchedulerBasicTest exampleTestWithDelays = null; + + @BeforeSuite + public void setUp() throws Exception { + exampleTest = new NanoSchedulerBasicTest(10, 2, 1, 10, false); + exampleTestWithDelays = new NanoSchedulerBasicTest(10, 2, 1, 10, true); + } + + @DataProvider(name = "NanoSchedulerBasicTest") + public Object[][] createNanoSchedulerBasicTest() { +// for ( final int bufferSize : Arrays.asList(1, 10) ) { +// for ( final int nt : Arrays.asList(1, 2, 4) ) { +// for ( final int start : Arrays.asList(0) ) { +// for ( final int end : Arrays.asList(0, 1, 2) ) { +// exampleTest = new NanoSchedulerBasicTest(bufferSize, nt, start, end, false); +// } +// } +// } +// } + + for ( final int bufferSize : Arrays.asList(-1, 1, 10, 100) ) { + for ( final int nt : Arrays.asList(1, 2, 4) ) { + for ( final int start : Arrays.asList(0) ) { + for ( final int end : Arrays.asList(0, 1, 2, 11, 100, 10000, 100000) ) { + for ( final boolean addDelays : Arrays.asList(true, false) ) { + if ( end < 1000 ) + new NanoSchedulerBasicTest(bufferSize, nt, start, end, addDelays); + } + } + } + } + } + + return NanoSchedulerBasicTest.getTests(NanoSchedulerBasicTest.class); + } + + @Test(enabled = true && ! DEBUG, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME) + public void testSingleThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { + logger.warn("Running " + test); + if ( test.nThreads == 1 ) + testNanoScheduler(test); + } + + @Test(enabled = true && ! 
DEBUG, dataProvider = "NanoSchedulerBasicTest", timeOut = NANO_SCHEDULE_MAX_RUNTIME, dependsOnMethods = "testSingleThreadedNanoScheduler") + public void testMultiThreadedNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { + logger.warn("Running " + test); + if ( test.nThreads >= 1 ) + testNanoScheduler(test); + } + + private void testNanoScheduler(final NanoSchedulerBasicTest test) throws InterruptedException { + final SimpleTimer timer = new SimpleTimer().start(); + final NanoScheduler nanoScheduler = test.makeScheduler(); + + final ProgressCallback callback = new ProgressCallback(); + nanoScheduler.setProgressFunction(callback); + + if ( test.bufferSize > -1 ) + Assert.assertEquals(nanoScheduler.getBufferSize(), test.bufferSize, "bufferSize argument"); + Assert.assertEquals(nanoScheduler.getnThreads(), test.nThreads, "nThreads argument"); + + final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); + Assert.assertNotNull(sum); + Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); + + Assert.assertTrue(callback.callBacks >= test.nExpectedCallbacks(), "Not enough callbacks detected. Expected at least " + test.nExpectedCallbacks() + " but saw only " + callback.callBacks); + nanoScheduler.shutdown(); + + // TODO -- need to enable only in the case where there's serious time spend in + // TODO -- read /map / reduce, otherwise the "outside" timer doesn't add up + final double myTimeEstimate = timer.getElapsedTime(); + final double tolerance = 0.1; + if ( false && myTimeEstimate > 0.1 ) { + Assert.assertTrue(nanoScheduler.getTotalRuntime() > myTimeEstimate * tolerance, + "NanoScheduler said that the total runtime was " + nanoScheduler.getTotalRuntime() + + " but the overall test time was " + myTimeEstimate + ", beyond our tolerance factor of " + + tolerance); + } + } + + @Test(enabled = true && ! 
DEBUG, dataProvider = "NanoSchedulerBasicTest", dependsOnMethods = "testMultiThreadedNanoScheduler", timeOut = NANO_SCHEDULE_MAX_RUNTIME) + public void testNanoSchedulerInLoop(final NanoSchedulerBasicTest test) throws InterruptedException { + if ( test.bufferSize > 1) { + logger.warn("Running " + test); + + final NanoScheduler nanoScheduler = test.makeScheduler(); + + // test reusing the scheduler + for ( int i = 0; i < 10; i++ ) { + final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); + Assert.assertNotNull(sum); + Assert.assertEquals((int)sum, test.expectedResult, "NanoScheduler sum not the same as calculated directly"); + } + + nanoScheduler.shutdown(); + } + } + + @Test(enabled = true && ! DEBUG, timeOut = NANO_SCHEDULE_MAX_RUNTIME) + public void testShutdown() throws InterruptedException { + final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); + Assert.assertFalse(nanoScheduler.isShutdown(), "scheduler should be alive"); + nanoScheduler.shutdown(); + Assert.assertTrue(nanoScheduler.isShutdown(), "scheduler should be dead"); + } + + @Test(enabled = true && ! DEBUG, expectedExceptions = IllegalStateException.class, timeOut = NANO_SCHEDULE_MAX_RUNTIME) + public void testShutdownExecuteFailure() throws InterruptedException { + final NanoScheduler nanoScheduler = new NanoScheduler(1, 2); + nanoScheduler.shutdown(); + nanoScheduler.execute(exampleTest.makeReader(), exampleTest.makeMap(), exampleTest.initReduce(), exampleTest.makeReduce()); + } + + @DataProvider(name = "NanoSchedulerInputExceptionTest") + public Object[][] createNanoSchedulerInputExceptionTest() { + List tests = new ArrayList(); + + + for ( final int bufSize : Arrays.asList(100) ) { + for ( final int nThreads : Arrays.asList(8) ) { + for ( final boolean addDelays : Arrays.asList(true, false) ) { + final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(bufSize, nThreads, 1, 1000000, false); + final int maxN = addDelays ? 
10000 : 100000; + for ( int nElementsBeforeError = 0; nElementsBeforeError < maxN; nElementsBeforeError += Math.max(nElementsBeforeError / 10, 1) ) { + tests.add(new Object[]{nElementsBeforeError, test, addDelays}); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(enabled = true, expectedExceptions = NullPointerException.class, timeOut = 10000) + public void testInputErrorIsThrown_NPE() throws InterruptedException { + executeTestErrorThrowingInput(10, new NullPointerException(), exampleTest, false); + } + + @Test(enabled = true, expectedExceptions = ReviewedStingException.class, timeOut = 10000) + public void testInputErrorIsThrown_RSE() throws InterruptedException { + executeTestErrorThrowingInput(10, new ReviewedStingException("test"), exampleTest, false); + } + + @Test(enabled = true, expectedExceptions = NullPointerException.class, dataProvider = "NanoSchedulerInputExceptionTest", timeOut = 10000, invocationCount = 1) + public void testInputErrorDoesntDeadlock(final int nElementsBeforeError, final NanoSchedulerBasicTest test, final boolean addDelays ) throws InterruptedException { + executeTestErrorThrowingInput(nElementsBeforeError, new NullPointerException(), test, addDelays); + } + + private void executeTestErrorThrowingInput(final int nElementsBeforeError, final RuntimeException ex, final NanoSchedulerBasicTest test, final boolean addDelays) { + logger.warn("executeTestErrorThrowingInput " + nElementsBeforeError + " ex=" + ex + " test=" + test + " addInputDelays=" + addDelays); + final NanoScheduler nanoScheduler = test.makeScheduler(); + nanoScheduler.execute(new ErrorThrowingIterator(nElementsBeforeError, ex, addDelays), test.makeMap(), test.initReduce(), test.makeReduce()); + } + + private static class ErrorThrowingIterator implements Iterator { + final int nElementsBeforeError; + final boolean addDelays; + int i = 0; + final RuntimeException ex; + + private ErrorThrowingIterator(final int nElementsBeforeError, RuntimeException 
ex, boolean addDelays) { + this.nElementsBeforeError = nElementsBeforeError; + this.ex = ex; + this.addDelays = addDelays; + } + + @Override public boolean hasNext() { return true; } + @Override public Integer next() { + if ( i++ > nElementsBeforeError ) { + throw ex; + } else if ( addDelays ) { + maybeDelayMe(i); + return i; + } else { + return i; + } + } + @Override public void remove() { throw new UnsupportedOperationException("x"); } + } + + public static void main(String [ ] args) { + org.apache.log4j.Logger logger = org.apache.log4j.Logger.getRootLogger(); + BasicConfigurator.configure(); + logger.setLevel(org.apache.log4j.Level.DEBUG); + + final NanoSchedulerBasicTest test = new NanoSchedulerBasicTest(1000, Integer.valueOf(args[0]), 0, Integer.valueOf(args[1]), false); + final NanoScheduler nanoScheduler = + new NanoScheduler(test.bufferSize, test.nThreads); + nanoScheduler.setDebug(true); + + final Integer sum = nanoScheduler.execute(test.makeReader(), test.makeMap(), test.initReduce(), test.makeReduce()); + System.out.printf("Sum = %d, expected =%d%n", sum, test.expectedResult); + nanoScheduler.shutdown(); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java new file mode 100644 index 000000000..39133d1ed --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/nanoScheduler/ReducerUnitTest.java @@ -0,0 +1,200 @@ +package org.broadinstitute.sting.utils.nanoScheduler; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MultiThreadedErrorTracker; +import org.broadinstitute.sting.utils.SimpleTimer; +import org.broadinstitute.sting.utils.Utils; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import 
java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.PriorityBlockingQueue; +import java.util.concurrent.TimeUnit; + +/** + * UnitTests for Reducer + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. + */ +public class ReducerUnitTest extends BaseTest { + @DataProvider(name = "ReducerThreadTest") + public Object[][] createReducerThreadTest() { + List tests = new ArrayList(); + + for ( final int groupSize : Arrays.asList(-1, 1, 5, 50, 500, 5000, 50000) ) { + for ( final boolean setJobIDAtStart : Arrays.asList(true, false) ) { + for ( final int nElements : Arrays.asList(0, 1, 3, 5) ) { + if ( groupSize < nElements ) { + for ( final List> jobs : Utils.makePermutations(makeJobs(nElements), nElements, false) ) { + tests.add(new Object[]{ new ListOfJobs(jobs), setJobIDAtStart, groupSize }); + } + } + } + + for ( final int nElements : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) ) { + if ( groupSize < nElements ) { + tests.add(new Object[]{ new ListOfJobs(makeJobs(nElements)), setJobIDAtStart, groupSize }); + } + } + } + } + + return tests.toArray(new Object[][]{}); + } + + private static class ListOfJobs extends ArrayList> { + private ListOfJobs(Collection> c) { + super(c); + } + + @Override + public String toString() { + if ( size() < 10 ) + return super.toString(); + else + return "JobList of " + size(); + } + } + + private static List> makeJobs(final int nElements) { + List> jobs = new ArrayList>(nElements); + for ( int i = 0; i < nElements; i++ ) { + jobs.add(new MapResult(i, i)); + } + return jobs; + } + + private int expectedSum(final List> jobs) { + int sum = 0; + for ( final MapResult job : jobs ) + sum += job.getValue(); + return sum; + } + + @Test(enabled = true, dataProvider = "ReducerThreadTest", timeOut = NanoSchedulerUnitTest.NANO_SCHEDULE_MAX_RUNTIME) + public void testReducerThread(final List> jobs, final boolean 
setJobIDAtStart, final int groupSize) throws Exception { + runTests(jobs, setJobIDAtStart, groupSize); + } + + private void runTests( final List> allJobs, boolean setJobIDAtStart, int groupSize ) throws Exception { + if ( groupSize == -1 ) + groupSize = allJobs.size(); + + final PriorityBlockingQueue> mapResultsQueue = new PriorityBlockingQueue>(); + + final List>> jobGroups = Utils.groupList(allJobs, groupSize); + final ReduceSumTest reduce = new ReduceSumTest(); + final Reducer reducer = new Reducer(reduce, new MultiThreadedErrorTracker(), new SimpleTimer(), 0); + + final TestWaitingForFinalReduce waitingThread = new TestWaitingForFinalReduce(reducer, expectedSum(allJobs)); + final ExecutorService es = Executors.newSingleThreadExecutor(); + es.submit(waitingThread); + + int nJobsSubmitted = 0; + int jobGroupCount = 0; + final int lastJobGroupCount = jobGroups.size() - 1; + setJobIDAtStart = setJobIDAtStart && groupSize == 1; + + for ( final List> jobs : jobGroups ) { + //logger.warn("Processing job group " + jobGroupCount + " with " + jobs.size() + " jobs"); + for ( final MapResult job : jobs ) { + mapResultsQueue.add(job); + nJobsSubmitted++; + } + + if ( jobGroupCount == lastJobGroupCount ) { + mapResultsQueue.add(new MapResult()); + nJobsSubmitted++; + } + + Assert.assertFalse(reducer.latchIsReleased(), "Latch should be closed at the start"); + + if ( jobGroupCount == 0 && setJobIDAtStart ) { + // only can do the setJobID if jobs cannot be submitted out of order + reducer.setTotalJobCount(allJobs.size()); + Assert.assertFalse(reducer.latchIsReleased(), "Latch should be closed even after setting last job if we haven't processed anything"); + } + + final int nReduced = reducer.reduceAsMuchAsPossible(mapResultsQueue); + Assert.assertTrue(nReduced <= nJobsSubmitted, "Somehow reduced more jobs than submitted"); + + if ( setJobIDAtStart ) { + final boolean submittedLastJob = jobGroupCount == lastJobGroupCount; + Assert.assertEquals(reducer.latchIsReleased(), 
submittedLastJob, + "When last job is set, latch should only be released if the last job has been submitted"); + } else { + Assert.assertEquals(reducer.latchIsReleased(), false, "When last job isn't set, latch should never be release"); + } + + jobGroupCount++; + } + + if ( setJobIDAtStart ) + Assert.assertTrue(reducer.latchIsReleased(), "Latch should be released after reducing with last job id being set"); + else { + Assert.assertFalse(reducer.latchIsReleased(), "Latch should be closed after reducing without last job id being set"); + reducer.setTotalJobCount(allJobs.size()); + Assert.assertTrue(reducer.latchIsReleased(), "Latch should be released after reducing after setting last job id "); + } + + Assert.assertEquals(reduce.nRead, allJobs.size(), "number of read values not all of the values in the reducer queue"); + es.shutdown(); + es.awaitTermination(1, TimeUnit.HOURS); + } + + @Test(expectedExceptions = IllegalStateException.class) + private void runSettingJobIDTwice() throws Exception { + final PriorityBlockingQueue> mapResultsQueue = new PriorityBlockingQueue>(); + + final Reducer reducer = new Reducer(new ReduceSumTest(), new MultiThreadedErrorTracker(), new SimpleTimer(), 0); + + reducer.setTotalJobCount(10); + reducer.setTotalJobCount(15); + } + + public class ReduceSumTest implements NSReduceFunction { + int nRead = 0; + int lastValue = -1; + + @Override public Integer apply(Integer one, Integer sum) { + Assert.assertTrue(lastValue < one, "Reduce came in out of order. 
Prev " + lastValue + " cur " + one); + + Assert.assertTrue(lastValue < one, "Read values coming out of order!"); + final int expected = lastValue + 1; + Assert.assertEquals((int)one, expected, "Value observed " + one + " not equal to the expected value " + expected); + nRead++; + lastValue = expected; + + return one + sum; + } + } + + final static class TestWaitingForFinalReduce implements Runnable { + final Reducer reducer; + final int expectedSum; + + TestWaitingForFinalReduce(Reducer reducer, final int expectedSum) { + this.reducer = reducer; + this.expectedSum = expectedSum; + } + + @Override + public void run() { + try { + final int observedSum = reducer.waitForFinalReduce(); + Assert.assertEquals(observedSum, expectedSum, "Reduce didn't sum to expected value"); + } catch ( InterruptedException ex ) { + Assert.fail("Got interrupted"); + } + } + } +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java index 33985e0ac..715acad03 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java @@ -100,8 +100,8 @@ public class RecalDatumUnitTest extends BaseTest { } private static void assertBasicFeaturesOfRecalDatum(final RecalDatum datum, final RecalDatumTestProvider cfg) { - Assert.assertEquals(datum.getNumMismatches(), cfg.exError); - Assert.assertEquals(datum.getNumObservations(), cfg.exTotal); + Assert.assertEquals(datum.getNumMismatches(), cfg.exError, 1E-6); + Assert.assertEquals(datum.getNumObservations(), cfg.exTotal, 1E-6); if ( cfg.getReportedQual() != -1 ) Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual()); BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalQuality(), cfg.getErrorRatePhredScaled()); diff --git 
a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java index 485da243f..d597b9f2c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -76,8 +76,8 @@ public class RecalibrationReportUnitTest { final ReadCovariates rc = RecalUtils.computeCovariates(read, requestedCovariates); final RecalibrationTables recalibrationTables = new RecalibrationTables(requestedCovariates); - final NestedIntegerArray rgTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); - final NestedIntegerArray qualTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); + final NestedIntegerArray rgTable = recalibrationTables.getReadGroupTable(); + final NestedIntegerArray qualTable = recalibrationTables.getQualityScoreTable(); for (int offset = 0; offset < length; offset++) { diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java new file mode 100644 index 000000000..74626d031 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ArtificialSingleSampleReadStreamUnitTest.java @@ -0,0 +1,161 @@ +package org.broadinstitute.sting.utils.sam; + +import net.sf.samtools.SAMFileHeader; +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.testng.annotations.Test; +import org.testng.annotations.DataProvider; + +import org.broadinstitute.sting.BaseTest; + +public class ArtificialSingleSampleReadStreamUnitTest extends BaseTest { + + private static 
class ArtificialSingleSampleReadStreamTest extends TestDataProvider { + private ArtificialSingleSampleReadStream stream; + private ArtificialSingleSampleReadStreamAnalyzer streamAnalyzer; + + public ArtificialSingleSampleReadStreamTest( ArtificialSingleSampleReadStream stream ) { + super(ArtificialSingleSampleReadStreamTest.class); + + this.stream = stream; + + setName(String.format("%s: numContigs=%d stacksPerContig=%d readsPerStack=%d-%d distanceBetweenStacks=%d-%d readLength=%d-%d unmappedReads=%d", + getClass().getSimpleName(), + stream.getNumContigs(), + stream.getNumStacksPerContig(), + stream.getMinReadsPerStack(), + stream.getMaxReadsPerStack(), + stream.getMinDistanceBetweenStacks(), + stream.getMaxDistanceBetweenStacks(), + stream.getMinReadLength(), + stream.getMaxReadLength(), + stream.getNumUnmappedReads())); + } + + public void run() { + streamAnalyzer= new ArtificialSingleSampleReadStreamAnalyzer(stream); + + streamAnalyzer.analyze(stream); + + // Check whether the observed properties of the stream match its nominal properties + streamAnalyzer.validate(); + } + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamTestDataProvider") + public Object[][] createArtificialSingleSampleReadStreamTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String readGroupID = "testReadGroup"; + SAMReadGroupRecord readGroup = new SAMReadGroupRecord(readGroupID); + readGroup.setSample("testSample"); + header.addReadGroup(readGroup); + + GenomeAnalysisEngine.resetRandomGenerator(); + + // brute force testing! 
+ for ( int numContigs = 0; numContigs <= 2; numContigs++ ) { + for ( int stacksPerContig = 0; stacksPerContig <= 2; stacksPerContig++ ) { + for ( int minReadsPerStack = 1; minReadsPerStack <= 2; minReadsPerStack++ ) { + for ( int maxReadsPerStack = 1; maxReadsPerStack <= 3; maxReadsPerStack++ ) { + for ( int minDistanceBetweenStacks = 1; minDistanceBetweenStacks <= 2; minDistanceBetweenStacks++ ) { + for ( int maxDistanceBetweenStacks = 1; maxDistanceBetweenStacks <= 3; maxDistanceBetweenStacks++ ) { + for ( int minReadLength = 1; minReadLength <= 2; minReadLength++ ) { + for ( int maxReadLength = 1; maxReadLength <= 3; maxReadLength++ ) { + for ( int numUnmappedReads = 0; numUnmappedReads <= 2; numUnmappedReads++ ) { + // Only test sane combinations here + if ( minReadsPerStack <= maxReadsPerStack && + minDistanceBetweenStacks <= maxDistanceBetweenStacks && + minReadLength <= maxReadLength && + ((numContigs > 0 && stacksPerContig > 0) || (numContigs == 0 && stacksPerContig == 0)) ) { + + new ArtificialSingleSampleReadStreamTest(new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + stacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads)); + } + } + } + } + } + } + } + } + } + } + + return ArtificialSingleSampleReadStreamTest.getTests(ArtificialSingleSampleReadStreamTest.class); + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamTestDataProvider") + public void testArtificialSingleSampleReadStream( ArtificialSingleSampleReadStreamTest test ) { + logger.warn("Running test: " + test); + + GenomeAnalysisEngine.resetRandomGenerator(); + test.run(); + } + + @DataProvider(name = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider") + public Object[][] createInvalidArgumentsTests() { + SAMFileHeader header = ArtificialSAMUtils.createArtificialSamHeader(3, 1, 10000); + String readGroupID = "testReadGroup"; + 
header.addReadGroup(new SAMReadGroupRecord(readGroupID)); + + return new Object[][] { + {"testNullHeader", null, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNullReadGroup", header, null, 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidReadGroup", header, "foo", 1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumContigs", header, readGroupID, -1, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidNumStacksPerContig", header, readGroupID, 1, -1, 1, 2, 1, 2, 1, 2, 0}, + {"test0ContigsNon0StacksPerContig", header, readGroupID, 0, 1, 1, 2, 1, 2, 1, 2, 0}, + {"testNon0Contigs0StacksPerContig", header, readGroupID, 1, 0, 1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMinReadsPerStack", header, readGroupID, 1, 1, -1, 2, 1, 2, 1, 2, 0}, + {"testInvalidMaxReadsPerStack", header, readGroupID, 1, 1, 1, -2, 1, 2, 1, 2, 0}, + {"testInvalidMinDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, -1, 2, 1, 2, 0}, + {"testInvalidMaxDistanceBetweenStacks", header, readGroupID, 1, 1, 1, 2, 1, -2, 1, 2, 0}, + {"testInvalidMinReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, -1, 2, 0}, + {"testInvalidMaxReadLength", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, -2, 0}, + {"testInvalidReadsPerStackRange", header, readGroupID, 1, 1, 2, 1, 1, 2, 1, 2, 0}, + {"testInvalidDistanceBetweenStacksRange", header, readGroupID, 1, 1, 1, 2, 2, 1, 1, 2, 0}, + {"testInvalidReadLengthRange", header, readGroupID, 1, 1, 1, 2, 1, 2, 2, 1, 0}, + {"testInvalidNumUnmappedReads", header, readGroupID, 1, 1, 1, 2, 1, 2, 1, 2, -1}, + }; + } + + @Test(dataProvider = "ArtificialSingleSampleReadStreamInvalidArgumentsTestDataProvider", + expectedExceptions = ReviewedStingException.class) + public void testInvalidArguments( String testName, + SAMFileHeader header, + String readGroupID, + int numContigs, + int numStacksPerContig, + int minReadsPerStack, + int maxReadsPerStack, + int minDistanceBetweenStacks, + int maxDistanceBetweenStacks, + int minReadLength, + int maxReadLength, + int numUnmappedReads ) { + + logger.warn("Running test: " + 
testName); + + ArtificialSingleSampleReadStream stream = new ArtificialSingleSampleReadStream(header, + readGroupID, + numContigs, + numStacksPerContig, + minReadsPerStack, + maxReadsPerStack, + minDistanceBetweenStacks, + maxDistanceBetweenStacks, + minReadLength, + maxReadLength, + numUnmappedReads); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java similarity index 65% rename from public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java index 5a606c50e..c072c808d 100755 --- a/public/java/test/org/broadinstitute/sting/utils/threading/StateMonitoringThreadFactoryUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/threading/EfficiencyMonitoringThreadFactoryUnitTest.java @@ -34,37 +34,41 @@ import org.testng.annotations.Test; import java.util.Arrays; import java.util.Collections; import java.util.List; -import java.util.concurrent.*; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; /** * Tests for the state monitoring thread factory. 
*/ -public class StateMonitoringThreadFactoryUnitTest extends BaseTest { +public class EfficiencyMonitoringThreadFactoryUnitTest extends BaseTest { // the duration of the tests -- 100 ms is tolerable given the number of tests we are doing - private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 100; + private final static long THREAD_TARGET_DURATION_IN_MILLISECOND = 100000; + private final static int MAX_THREADS = 4; final static Object GLOBAL_LOCK = new Object(); private class StateTest extends TestDataProvider { private final double TOLERANCE = 0.1; // willing to tolerate a 10% error - final List statesForThreads; + final List statesForThreads; - public StateTest(final List statesForThreads) { + public StateTest(final List statesForThreads) { super(StateTest.class); this.statesForThreads = statesForThreads; setName("StateTest " + Utils.join(",", statesForThreads)); } - public List getStatesForThreads() { + public List getStatesForThreads() { return statesForThreads; } public int getNStates() { return statesForThreads.size(); } - public double maxStateFraction(final Thread.State state) { return fraction(state) + TOLERANCE; } - public double minStateFraction(final Thread.State state) { return fraction(state) - TOLERANCE; } + public double maxStatePercent(final EfficiencyMonitoringThreadFactory.State state) { return 100*(fraction(state) + TOLERANCE); } + public double minStatePercent(final EfficiencyMonitoringThreadFactory.State state) { return 100*(fraction(state) - TOLERANCE); } - private double fraction(final Thread.State state) { + private double fraction(final EfficiencyMonitoringThreadFactory.State state) { return Collections.frequency(statesForThreads, state) / (1.0 * statesForThreads.size()); } } @@ -74,18 +78,16 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { * requested for input argument */ private static class StateTestThread implements Callable { - private final Thread.State stateToImplement; + private final 
EfficiencyMonitoringThreadFactory.State stateToImplement; - private StateTestThread(final Thread.State stateToImplement) { - if ( ! StateMonitoringThreadFactory.TRACKED_STATES.contains(stateToImplement) ) - throw new IllegalArgumentException("Unexpected state " + stateToImplement); + private StateTestThread(final EfficiencyMonitoringThreadFactory.State stateToImplement) { this.stateToImplement = stateToImplement; } @Override public Double call() throws Exception { switch ( stateToImplement ) { - case RUNNABLE: + case USER_CPU: // do some work until we get to THREAD_TARGET_DURATION_IN_MILLISECOND double sum = 0.0; final long startTime = System.currentTimeMillis(); @@ -96,13 +98,17 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { case WAITING: Thread.currentThread().sleep(THREAD_TARGET_DURATION_IN_MILLISECOND); return 0.0; - case BLOCKED: - if ( StateMonitoringThreadFactory.DEBUG ) logger.warn("Blocking..."); + case BLOCKING: + if ( EfficiencyMonitoringThreadFactory.DEBUG ) logger.warn("Blocking..."); synchronized (GLOBAL_LOCK) { // the GLOBAL_LOCK must be held by the unit test itself for this to properly block - if ( StateMonitoringThreadFactory.DEBUG ) logger.warn(" ... done blocking"); + if ( EfficiencyMonitoringThreadFactory.DEBUG ) logger.warn(" ... 
done blocking"); } return 0.0; + case WAITING_FOR_IO: + // TODO -- implement me + // shouldn't ever get here, throw an exception + throw new ReviewedStingException("WAITING_FOR_IO testing currently not implemented, until we figure out how to force a system call block"); default: throw new ReviewedStingException("Unexpected thread test state " + stateToImplement); } @@ -111,8 +117,11 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { @DataProvider(name = "StateTest") public Object[][] createStateTest() { - for ( final int nThreads : Arrays.asList(1, 2, 3, 4) ) { - for (final List states : Utils.makePermutations(StateMonitoringThreadFactory.TRACKED_STATES, nThreads, true) ) { + for ( final int nThreads : Arrays.asList(3) ) { + //final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.WAITING_FOR_IO); + final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.USER_CPU, EfficiencyMonitoringThreadFactory.State.WAITING, EfficiencyMonitoringThreadFactory.State.BLOCKING); + //final List allStates = Arrays.asList(EfficiencyMonitoringThreadFactory.State.values()); + for (final List states : Utils.makePermutations(allStates, nThreads, true) ) { //if ( Collections.frequency(states, Thread.State.BLOCKED) > 0) new StateTest(states); } @@ -121,16 +130,19 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { return StateTest.getTests(StateTest.class); } - @Test(enabled = false, dataProvider = "StateTest") + // NOTE this test takes an unreasonably long time to run, and so it's been disabled as these monitoring threads + // aren't a core GATK feature any longer. 
Should be reabled if we come to care about this capability again + // in the future, or we can run these in parallel + @Test(enabled = false, dataProvider = "StateTest", timeOut = MAX_THREADS * THREAD_TARGET_DURATION_IN_MILLISECOND) public void testStateTest(final StateTest test) throws InterruptedException { // allows us to test blocking - final StateMonitoringThreadFactory factory = new StateMonitoringThreadFactory(test.getNStates()); + final EfficiencyMonitoringThreadFactory factory = new EfficiencyMonitoringThreadFactory(test.getNStates()); final ExecutorService threadPool = Executors.newFixedThreadPool(test.getNStates(), factory); logger.warn("Running " + test); synchronized (GLOBAL_LOCK) { //logger.warn(" Have lock"); - for ( final Thread.State threadToRunState : test.getStatesForThreads() ) + for ( final EfficiencyMonitoringThreadFactory.State threadToRunState : test.getStatesForThreads() ) threadPool.submit(new StateTestThread(threadToRunState)); // lock has to be here for the whole running of the activeThreads but end before the sleep so the blocked activeThreads @@ -153,10 +165,10 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { Assert.assertTrue(totalTime >= minTime, "Factory results not properly accumulated: totalTime = " + totalTime + " < minTime = " + minTime); Assert.assertTrue(totalTime <= maxTime, "Factory results not properly accumulated: totalTime = " + totalTime + " > maxTime = " + maxTime); - for (final Thread.State state : StateMonitoringThreadFactory.TRACKED_STATES ) { - final double min = test.minStateFraction(state); - final double max = test.maxStateFraction(state); - final double obs = factory.getStateFraction(state); + for (final EfficiencyMonitoringThreadFactory.State state : EfficiencyMonitoringThreadFactory.State.values() ) { + final double min = test.minStatePercent(state); + final double max = test.maxStatePercent(state); + final double obs = factory.getStatePercent(state); // logger.warn(" Checking " + state 
// + " min " + String.format("%.2f", min) // + " max " + String.format("%.2f", max) @@ -170,6 +182,6 @@ public class StateMonitoringThreadFactoryUnitTest extends BaseTest { Assert.assertEquals(factory.getNThreadsCreated(), test.getNStates()); // should be called to ensure we don't format / NPE on output - factory.printUsageInformation(logger, Priority.INFO); + factory.printUsageInformation(logger, Priority.WARN); } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 26e2dbfbc..6785fa816 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -596,6 +596,51 @@ public class VariantContextTestProvider { return TEST_DATAs; } + public static void testReaderWriterWithMissingGenotypes(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { + final int nSamples = data.header.getNGenotypeSamples(); + if ( nSamples > 2 ) { + for ( final VariantContext vc : data.vcs ) + if ( vc.isSymbolic() ) + // cannot handle symbolic alleles because they may be weird non-call VCFs + return; + + final File tmpFile = File.createTempFile("testReaderWriter", tester.getExtension()); + tmpFile.deleteOnExit(); + + // write expected to disk + final EnumSet options = EnumSet.of(Options.INDEX_ON_THE_FLY); + final VariantContextWriter writer = tester.makeWriter(tmpFile, options); + + final Set samplesInVCF = new HashSet(data.header.getGenotypeSamples()); + final List missingSamples = Arrays.asList("MISSING1", "MISSING2"); + final List allSamples = new ArrayList(missingSamples); + allSamples.addAll(samplesInVCF); + + final VCFHeader header = new VCFHeader(data.header.getMetaDataInInputOrder(), allSamples); + 
writeVCsToFile(writer, header, data.vcs); + + // ensure writing of expected == actual + final Pair> p = readAllVCs(tmpFile, tester.makeCodec()); + final Iterable actual = p.getSecond(); + + int i = 0; + for ( final VariantContext readVC : actual ) { + if ( readVC == null ) continue; // sometimes we read null records... + final VariantContext expected = data.vcs.get(i++); + for ( final Genotype g : readVC.getGenotypes() ) { + Assert.assertTrue(allSamples.contains(g.getSampleName())); + if ( samplesInVCF.contains(g.getSampleName()) ) { + assertEquals(g, expected.getGenotype(g.getSampleName())); + } else { + // missing + Assert.assertTrue(g.isNoCall()); + } + } + } + + } + } + public static void testReaderWriter(final VariantContextIOTest tester, final VariantContextTestData data) throws IOException { testReaderWriter(tester, data.header, data.vcs, data.vcs, true); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java index 272166c68..19620b8df 100755 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java @@ -750,6 +750,10 @@ public class VariantContextUnitTest extends BaseTest { modified = new VariantContextBuilder(modified).attributes(null).attribute("AC", 1).make(); Assert.assertEquals(modified.getAttribute("AC"), 1); + // test the behavior when the builder's attribute object is not initialized + modified = new VariantContextBuilder(modified.getSource(), modified.getChr(), modified.getStart(), modified.getEnd(), modified.getAlleles()).attribute("AC", 1).make(); + + // test normal attribute modification modified = new VariantContextBuilder(cfg.vc).attribute("AC", 1).make(); Assert.assertEquals(modified.getAttribute("AC"), 1); modified = new VariantContextBuilder(modified).attribute("AC", 
2).make(); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java index 95e8458c8..114104d42 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java @@ -598,8 +598,8 @@ public class VariantContextUtilsUnitTest extends BaseTest { private RepeatDetectorTest(boolean isTrueRepeat, String ref, String refAlleleString, String ... altAlleleStrings) { super(RepeatDetectorTest.class); - this.ref = "N" + ref; // add a dummy base for the event here this.isTrueRepeat = isTrueRepeat; + this.ref = ref; List alleles = new LinkedList(); final Allele refAllele = Allele.create(refAlleleString, true); @@ -609,7 +609,7 @@ public class VariantContextUtilsUnitTest extends BaseTest { alleles.add(alt); } - VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, 1 + refAllele.length(), alleles); + VariantContextBuilder builder = new VariantContextBuilder("test", "chr1", 1, refAllele.length(), alleles); this.vc = builder.make(); } @@ -620,31 +620,31 @@ public class VariantContextUtilsUnitTest extends BaseTest { @DataProvider(name = "RepeatDetectorTest") public Object[][] makeRepeatDetectorTest() { - new RepeatDetectorTest(true, "AAC", "-", "A"); - new RepeatDetectorTest(true, "AAC", "A", "-"); - new RepeatDetectorTest(false, "AAC", "AA", "-"); - new RepeatDetectorTest(false, "AAC", "-", "C"); + new RepeatDetectorTest(true, "NAAC", "N", "NA"); + new RepeatDetectorTest(true, "NAAC", "NA", "N"); + new RepeatDetectorTest(false, "NAAC", "NAA", "N"); + new RepeatDetectorTest(false, "NAAC", "N", "NC"); new RepeatDetectorTest(false, "AAC", "A", "C"); // running out of ref bases => false - new RepeatDetectorTest(false, "AAC", "-", "CAGTA"); + new RepeatDetectorTest(false, 
"NAAC", "N", "NCAGTA"); // complex repeats - new RepeatDetectorTest(true, "ATATATC", "-", "AT"); - new RepeatDetectorTest(true, "ATATATC", "-", "ATA"); - new RepeatDetectorTest(true, "ATATATC", "-", "ATAT"); - new RepeatDetectorTest(true, "ATATATC", "AT", "-"); - new RepeatDetectorTest(false, "ATATATC", "ATA", "-"); - new RepeatDetectorTest(false, "ATATATC", "ATAT", "-"); + new RepeatDetectorTest(true, "NATATATC", "N", "NAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATA", "N"); + new RepeatDetectorTest(false, "NATATATC", "NATAT", "N"); // multi-allelic - new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATAT"); - new RepeatDetectorTest(true, "ATATATC", "-", "AT", "ATA"); - new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATAT"); - new RepeatDetectorTest(true, "ATATATC", "AT", "-", "ATA"); // two As - new RepeatDetectorTest(false, "ATATATC", "AT", "-", "ATC"); // false - new RepeatDetectorTest(false, "ATATATC", "AT", "-", "CC"); // false - new RepeatDetectorTest(false, "ATATATC", "AT", "ATAT", "CC"); // false + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "N", "NAT", "NATA"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATAT"); + new RepeatDetectorTest(true, "NATATATC", "NAT", "N", "NATA"); // two As + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NATC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "N", "NCC"); // false + new RepeatDetectorTest(false, "NATATATC", "NAT", "NATAT", "NCC"); // false return RepeatDetectorTest.getTests(RepeatDetectorTest.class); } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java index 
1b791bf6c..adf3eb235 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/writer/VariantContextWritersUnitTest.java @@ -82,6 +82,11 @@ public class VariantContextWritersUnitTest extends BaseTest { VariantContextTestProvider.testReaderWriter(new BCFIOTester(), testData); } + @Test(dataProvider = "VariantContextTest_SingleContexts") + public void testBCF2WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { + VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new BCFIOTester(), testData); + } + private class BCFIOTester extends VariantContextTestProvider.VariantContextIOTest { @Override public String getExtension() { @@ -110,6 +115,11 @@ public class VariantContextWritersUnitTest extends BaseTest { VariantContextTestProvider.testReaderWriter(new VCFIOTester(), testData); } + @Test(enabled = true, dataProvider = "VariantContextTest_SingleContexts") + public void testVCF4WriterReaderMissingGenotypes(final VariantContextTestProvider.VariantContextTestData testData) throws IOException { + VariantContextTestProvider.testReaderWriterWithMissingGenotypes(new VCFIOTester(), testData); + } + private class VCFIOTester extends VariantContextTestProvider.VariantContextIOTest { @Override public String getExtension() { diff --git a/public/perl/sortByRef.pl b/public/perl/sortByRef.pl index 71d3f4477..e17707796 100755 --- a/public/perl/sortByRef.pl +++ b/public/perl/sortByRef.pl @@ -50,7 +50,7 @@ my %ref_order; my $n = 0; while ( ) { chomp; - my ($contig, $rest) = split "\t"; + my ($contig, $rest) = split '\s'; die("Dictionary file is probably corrupt: multiple instances of contig $contig") if ( defined $ref_order{$contig} ); $ref_order{$contig} = $n; diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala 
b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala index 56f6460fb..165e6a4e9 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala @@ -13,6 +13,7 @@ import net.sf.samtools.SAMFileHeader.SortOrder import org.broadinstitute.sting.queue.util.QScriptUtils import org.broadinstitute.sting.queue.function.ListWriterFunction import org.broadinstitute.sting.commandline.Hidden +import org.broadinstitute.sting.commandline class DataProcessingPipeline extends QScript { qscript => @@ -41,34 +42,34 @@ class DataProcessingPipeline extends QScript { @Input(doc="The path to the binary of bwa (usually BAM files have already been mapped - but if you want to remap this is the option)", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ - @Input(doc="the project name determines the final output (BAM file) base name. Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false) + @Argument(doc="the project name determines the final output (BAM file) base name. 
Example NA12878 yields NA12878.processed.bam", fullName="project", shortName="p", required=false) var projectName: String = "project" - @Input(doc="Output path for the processed BAM files.", fullName="output_directory", shortName="outputDir", required=false) + @Argument(doc="Output path for the processed BAM files.", fullName="output_directory", shortName="outputDir", required=false) var outputDir: String = "" - @Input(doc="the -L interval string to be used by GATK - output bams at interval only", fullName="gatk_interval_string", shortName="L", required=false) + @Argument(doc="the -L interval string to be used by GATK - output bams at interval only", fullName="gatk_interval_string", shortName="L", required=false) var intervalString: String = "" @Input(doc="an intervals file to be used by GATK - output bams at intervals only", fullName="gatk_interval_file", shortName="intervals", required=false) var intervals: File = _ - @Input(doc="Cleaning model: KNOWNS_ONLY, USE_READS or USE_SW", fullName="clean_model", shortName="cm", required=false) + @Argument(doc="Cleaning model: KNOWNS_ONLY, USE_READS or USE_SW", fullName="clean_model", shortName="cm", required=false) var cleaningModel: String = "USE_READS" - @Input(doc="Decompose input BAM file and fully realign it using BWA and assume Single Ended reads", fullName="use_bwa_single_ended", shortName="bwase", required=false) + @Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Single Ended reads", fullName="use_bwa_single_ended", shortName="bwase", required=false) var useBWAse: Boolean = false - @Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false) + @Argument(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false) var useBWApe: Boolean = false - @Input(doc="Decompose input BAM file and 
fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false) + @Argument(doc="Decompose input BAM file and fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false) var useBWAsw: Boolean = false - @Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false) + @Argument(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false) var bwaThreads: Int = 1 - @Input(doc="Perform validation on the BAM files", fullName="validation", shortName="vs", required=false) + @Argument(doc="Perform validation on the BAM files", fullName="validation", shortName="vs", required=false) var validation: Boolean = false @@ -76,15 +77,15 @@ class DataProcessingPipeline extends QScript { * Hidden Parameters ****************************************************************************/ @Hidden - @Input(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false) + @Argument(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false) var nContigs: Int = -1 @Hidden - @Input(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false) + @Argument(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false) var defaultPlatform: String = "" @Hidden - @Input(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false) + @Argument(doc="Run the pipeline in test mode only", fullName = "test_mode", shortName = "test", required=false) var testMode: Boolean = false diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 
3dc953361..24ab50451 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -122,21 +122,28 @@ class GATKResourcesBundle extends QScript { // // standard VCF files. Will be lifted to each reference // - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_135_b37.leftAligned.vcf", - "dbsnp_135", b37, true, false)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_137_b37.leftAligned.vcf", + "dbsnp_137", b37, true, false)) - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_genotypes_1525_samples.b37.vcf", - "1000G_omni2.5", b37, true, true)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/Omni2.5_chip/Omni25_sites_2141_samples.b37.vcf", + "1000G_omni2.5", b37, true, false)) - addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/genotypes_r27_nr.b37_fwd.vcf", - "hapmap_3.3", b37, true, true)) + addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf", + "hapmap_3.3", b37, true, false)) addResource(new Resource("/humgen/1kg/DCC/ftp/technical/working/20120312_phase1_v2_indel_cleaned_sites_list/ALL.wgs.phase1_release_v2.20101123.official_indel_calls.20120312.sites.vcf", "1000G_phase1.indels", b37, true, false)) addResource(new Resource("/humgen/gsa-hpprojects/GATK/data/Comparisons/Unvalidated/GoldStandardIndel/gold.standard.indel.MillsAnd1000G.b37.vcf", - "Mills_and_1000G_gold_standard.indels", b37, true, true)) - + "Mills_and_1000G_gold_standard.indels", b37, true, false)) + + // + // CEU trio (NA12878,NA12891,NA12892) best practices results (including PBT) + // + + addResource(new 
Resource("/humgen/gsa-hpprojects/NA12878Collection/callsets/CEUtrio_BestPractices/CEUTrio.HiSeq.WGS.b37.snps_and_indels.recalibrated.filtered.phased.CURRENT.vcf", + "CEUTrio.HiSeq.WGS.b37.bestPractices.phased",b37,true,false)) + // // example call set for wiki tutorial // @@ -310,6 +317,7 @@ class GATKResourcesBundle extends QScript { class UG(@Input bam: File, @Input ref: File, @Input outVCF: File) extends UnifiedGenotyper with UNIVERSAL_GATK_ARGS { this.input_file = List(bam) this.reference_sequence = ref + this.intervalsString ++= List("20"); this.out = outVCF } diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala index a4a6636fe..ef73840b3 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/PacbioProcessingPipeline.scala @@ -27,28 +27,28 @@ class PacbioProcessingPipeline extends QScript { @Input(doc="dbsnp VCF file to use ", shortName="D", required=true) var dbSNP: File = _ - @Input(doc="Number of jobs to scatter/gather. Default: 0." , shortName = "sg", required=false) + @Argument(doc="Number of jobs to scatter/gather. Default: 0." 
, shortName = "sg", required=false) var threads: Int = 0 - @Input(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false) + @Argument(doc="Sample Name to fill in the Read Group information (only necessary if using fasta/fastq)" , shortName = "sn", required=false) var sample: String = "NA" @Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false) var bwaPath: File = _ - @Input(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false) + @Argument(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false) var BLASR_BAM: Boolean = false @Hidden - @Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false) + @Argument(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." 
, shortName = "dbq", required=false) var dbq: Int = 20 @Hidden - @Input(shortName="bwastring", required=false) + @Argument(shortName="bwastring", required=false) var bwastring: String = "" @Hidden - @Input(shortName = "test", fullName = "test_mode", required = false) + @Argument(shortName = "test", fullName = "test_mode", required = false) var testMode: Boolean = false val queueLogDir: String = ".qlog/" diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala index 09a24e782..1cd5a7512 100644 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/examples/ExampleRetryMemoryLimit.scala @@ -10,13 +10,17 @@ class ExampleRetryMemoryLimit extends QScript { var bamFile: File = _ def script() { - val ug = new UnifiedGenotyper with RetryMemoryLimit - // First run with 1m - ug.memoryLimit = .001 - // On retry run with 1g - ug.retryMemoryFunction = (d => d * 1000) - ug.reference_sequence = referenceFile - ug.input_file = Seq(bamFile) - add(ug) + for (scatterCount <- 1 to 2) { + val ug = new UnifiedGenotyper with RetryMemoryLimit + // First run with 1m + ug.memoryLimit = .001 + // On retry run with 1g + ug.retryMemoryFunction = (d => d * 1000) + ug.reference_sequence = referenceFile + ug.input_file = Seq(bamFile) + ug.out = swapExt(bamFile, ".bam", ".scattered_%d.vcf".format(scatterCount)) + ug.scatterCount = scatterCount + add(ug) + } } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 041e84a8c..5b84bfd16 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -28,7 +28,7 @@ import function.QFunction 
import java.io.File import org.broadinstitute.sting.commandline._ import org.broadinstitute.sting.queue.util._ -import org.broadinstitute.sting.queue.engine.{QGraphSettings, QGraph} +import org.broadinstitute.sting.queue.engine.{QStatusMessenger, QGraphSettings, QGraph} import collection.JavaConversions._ import org.broadinstitute.sting.utils.classloader.PluginManager import org.broadinstitute.sting.utils.exceptions.UserException @@ -64,10 +64,10 @@ object QCommandLine extends Logging { Runtime.getRuntime.removeShutdownHook(shutdownHook) qCommandLine.shutdown() } catch { - case _ => /* ignore, example 'java.lang.IllegalStateException: Shutdown in progress' */ + case e: Exception => /* ignore, example 'java.lang.IllegalStateException: Shutdown in progress' */ } if (CommandLineProgram.result != 0) - System.exit(CommandLineProgram.result); + System.exit(CommandLineProgram.result) } catch { case e: Exception => CommandLineProgram.exitSystemWithError(e) } @@ -90,28 +90,46 @@ class QCommandLine extends CommandLineProgram with Logging { private var qScriptClasses: File = _ private var shuttingDown = false - private lazy val pluginManager = { + private lazy val qScriptPluginManager = { qScriptClasses = IOUtils.tempDir("Q-Classes-", "", settings.qSettings.tempDirectory) qScriptManager.loadScripts(scripts, qScriptClasses) new PluginManager[QScript](classOf[QScript], Seq(qScriptClasses.toURI.toURL)) } - QFunction.parsingEngine = new ParsingEngine(this) + private lazy val qStatusMessengerPluginManager = { + new PluginManager[QStatusMessenger](classOf[QStatusMessenger]) + } + + ClassFieldCache.parsingEngine = new ParsingEngine(this) /** * Takes the QScripts passed in, runs their script() methods, retrieves their generated * functions, and then builds and runs a QGraph based on the dependencies. 
*/ def execute = { + val allStatusMessengers = qStatusMessengerPluginManager.createAllTypes() + if (settings.qSettings.runName == null) settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) + if (IOUtils.isDefaultTempDir(settings.qSettings.tempDirectory)) + settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp") + qGraph.initializeWithSettings(settings) - qGraph.settings = settings + for (statusMessenger <- allStatusMessengers) { + loadArgumentsIntoObject(statusMessenger) + } - val allQScripts = pluginManager.createAllTypes(); + for (statusMessenger <- allStatusMessengers) { + statusMessenger.started() + } + + val allQScripts = qScriptPluginManager.createAllTypes() for (script <- allQScripts) { - logger.info("Scripting " + pluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) + logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) loadArgumentsIntoObject(script) + // TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now. 
+ //if (settings.run) + script.pullInputs() script.qSettings = settings.qSettings try { script.script() @@ -137,32 +155,22 @@ class QCommandLine extends CommandLineProgram with Logging { logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", functionsAndStatus.size)) - if (!settings.disableJobReport) { - val jobStringName = { - if (settings.jobReportFile != null) - settings.jobReportFile - else - settings.qSettings.runName + ".jobreport.txt" - } + // write the final complete job report + logger.info("Writing final jobs report...") + qGraph.writeJobsReport() - if (!shuttingDown) { - val reportFile = IOUtils.absolute(settings.qSettings.runDirectory, jobStringName) - logger.info("Writing JobLogging GATKReport to file " + reportFile) - QJobReport.printReport(functionsAndStatus, reportFile) - - if (settings.run) { - val pdfFile = IOUtils.absolute(settings.qSettings.runDirectory, FilenameUtils.removeExtension(jobStringName) + ".pdf") - logger.info("Plotting JobLogging GATKReport to file " + pdfFile) - QJobReport.plotReport(reportFile, pdfFile) - } - } - } - - if (!qGraph.success) { + if (!success) { logger.info("Done with errors") qGraph.logFailed() + for (statusMessenger <- allStatusMessengers) + statusMessenger.exit("Done with errors") 1 } else { + if (settings.run) { + allQScripts.foreach(_.pushOutputs()) + for (statusMessenger <- allStatusMessengers) + statusMessenger.done(allQScripts.map(_.remoteOutputs)) + } 0 } } @@ -174,19 +182,30 @@ class QCommandLine extends CommandLineProgram with Logging { override def canAddArgumentsDynamically = true /** - * Returns the list of QScripts passed in via -S so that their - * arguments can be inspected before QScript.script is called. - * @return Array of QScripts passed in. + * Returns the list of QScripts passed in via -S and other plugins + * so that their arguments can be inspected before QScript.script is called. 
+ * @return Array of dynamic sources */ - override def getArgumentSources = - pluginManager.getPlugins.toIterable.toArray.asInstanceOf[Array[Class[_]]] + override def getArgumentSources = { + var plugins = Seq.empty[Class[_]] + plugins ++= qScriptPluginManager.getPlugins + plugins ++= qStatusMessengerPluginManager.getPlugins + plugins.toArray + } /** - * Returns the name of a QScript - * @return The name of a QScript + * Returns the name of a script/plugin + * @return The name of a script/plugin */ - override def getArgumentSourceName(source: Class[_]) = - pluginManager.getName(source.asSubclass(classOf[QScript])) + override def getArgumentSourceName(source: Class[_]) = { + if (classOf[QScript].isAssignableFrom(source)) + qScriptPluginManager.getName(source.asSubclass(classOf[QScript])) + else if (classOf[QStatusMessenger].isAssignableFrom(source)) + qStatusMessengerPluginManager.getName(source.asSubclass(classOf[QStatusMessenger])) + else + null + + } /** * Returns a ScalaCompoundArgumentTypeDescriptor that can parse argument sources into scala collections. 
@@ -205,7 +224,7 @@ class QCommandLine extends CommandLineProgram with Logging { private def createQueueHeader() : Seq[String] = { Seq(String.format("Queue v%s, Compiled %s", getQueueVersion, getBuildTimestamp), "Copyright (c) 2012 The Broad Institute", - "Fro support and documentation go to http://www.broadinstitute.org/gatk") + "For support and documentation go to http://www.broadinstitute.org/gatk") } private def getQueueVersion : String = { diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index 6f887ea00..2dcfb916c 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -27,7 +27,8 @@ package org.broadinstitute.sting.queue import engine.JobRunInfo import org.broadinstitute.sting.queue.function.QFunction import annotation.target.field -import util.{StringFileConversions, PrimitiveOptionConversions, Logging} +import util._ +import org.broadinstitute.sting.commandline.ArgumentSource /** * Defines a Queue pipeline as a collection of CommandLineFunctions. 
@@ -106,6 +107,37 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon def addAll(functions: Seq[QFunction]) { functions.foreach( f => add(f) ) } + + def pullInputs() { + val inputs = ClassFieldCache.getFieldFiles(this, inputFields) + for (remoteFile <- filterRemoteFiles(inputs)) { + logger.info("Pulling %s from %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription)) + remoteFile.pullToLocal() + } + } + + def pushOutputs() { + val outputs = ClassFieldCache.getFieldFiles(this, outputFields) + for (remoteFile <- filterRemoteFiles(outputs)) { + logger.info("Pushing %s to %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription)) + remoteFile.pushToRemote() + } + } + + def remoteOutputs: Map[ArgumentSource, Seq[RemoteFile]] = + outputFields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap + + private def filterRemoteFiles(fields: Seq[File]): Seq[RemoteFile] = + fields.filter(field => field != null && field.isInstanceOf[RemoteFile]).map(_.asInstanceOf[RemoteFile]) + + /** The complete list of fields. */ + def functionFields: Seq[ArgumentSource] = ClassFieldCache.classFunctionFields(this.getClass) + /** The @Input fields. */ + def inputFields: Seq[ArgumentSource] = ClassFieldCache.classInputFields(this.getClass) + /** The @Output fields. */ + def outputFields: Seq[ArgumentSource] = ClassFieldCache.classOutputFields(this.getClass) + /** The @Argument fields. 
*/ + def argumentFields: Seq[ArgumentSource] = ClassFieldCache.classArgumentFields(this.getClass) } object QScript { diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala b/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala index 74487917f..2528c0572 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScriptManager.scala @@ -11,6 +11,7 @@ import org.apache.log4j.Level import scala.tools.nsc.util.{FakePos, NoPosition, Position} import org.broadinstitute.sting.queue.util.TextFormatUtils._ import org.broadinstitute.sting.utils.classloader.JVMUtils +import tools.util.StringOps /** * Plugin manager for QScripts which loads QScripts into the current class loader. @@ -63,7 +64,7 @@ object QScriptManager extends Logging { * Heavily based on scala/src/compiler/scala/tools/nsc/reporters/ConsoleReporter.scala */ private class Log4JReporter(val settings: Settings) extends AbstractReporter { - def displayPrompt { throw new UnsupportedOperationException("Unable to prompt the user. Prompting should be off.") } + def displayPrompt() { throw new UnsupportedOperationException("Unable to prompt the user. Prompting should be off.") } /** * Displays the message at position with severity. 
@@ -98,9 +99,9 @@ object QScriptManager extends Logging { */ def printSummary() { if (WARNING.count > 0) - printMessage(Level.WARN, countElementsAsString(WARNING.count, "warning") + " found") + printMessage(Level.WARN, StringOps.countElementsAsString(WARNING.count, "warning") + " found") if (ERROR.count > 0) - printMessage(Level.ERROR, countElementsAsString(ERROR.count, "error") + " found") + printMessage(Level.ERROR, StringOps.countElementsAsString(ERROR.count, "error") + " found") } /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala index 1a50301f1..2c0f43bac 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QSettings.scala @@ -25,7 +25,7 @@ package org.broadinstitute.sting.queue import java.io.File -import org.broadinstitute.sting.commandline.Argument +import org.broadinstitute.sting.commandline.{ClassType, Argument} /** * Default settings settable on the command line and passed to CommandLineFunctions. @@ -41,6 +41,7 @@ class QSettings { var jobQueue: String = _ @Argument(fullName="job_priority", shortName="jobPriority", doc="Default priority for jobs. Min = 0, Max = 100", required=false) + @ClassType(classOf[Int]) var jobPriority: Option[Int] = None @Argument(fullName="job_native_arg", shortName="jobNative", doc="Native arguments to pass to the job runner.", required=false) @@ -52,16 +53,20 @@ class QSettings { @Argument(fullName="job_environment_name", shortName="jobEnv", doc="Environment names for the job runner.", required=false) var jobEnvironmentNames: Seq[String] = Nil - @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes.", required=false) - var memoryLimit: Option[Double] = None + @Argument(fullName="memory_limit", shortName="memLimit", doc="Default memory limit for jobs, in gigabytes. 
If not set defaults to 2GB.", required=false) + @ClassType(classOf[Double]) + var memoryLimit: Option[Double] = Some(2) @Argument(fullName="memory_limit_threshold", shortName="memLimitThresh", doc="After passing this threshold stop increasing memory limit for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) var memoryLimitThreshold: Option[Double] = None @Argument(fullName="resident_memory_limit", shortName="resMemLimit", doc="Default resident memory limit for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) var residentLimit: Option[Double] = None @Argument(fullName="resident_memory_request", shortName="resMemReq", doc="Default resident memory request for jobs, in gigabytes.", required=false) + @ClassType(classOf[Double]) var residentRequest: Option[Double] = None @Argument(fullName="resident_memory_request_parameter", shortName="resMemReqParam", doc="Parameter for resident memory requests. By default not requested.", required=false) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala index 2d4ff60f5..62c016812 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/FunctionEdge.scala @@ -185,7 +185,7 @@ class FunctionEdge(val function: QFunction, val inputs: QNode, val outputs: QNod val tailLines = IOUtils.tail(errorFile, maxLines) val nl = "%n".format() val summary = if (tailLines.size > maxLines) "Last %d lines".format(maxLines) else "Contents" - this.function.jobErrorLines = collection.JavaConversions.asScalaIterable(tailLines).toSeq + this.function.jobErrorLines = collection.JavaConversions.collectionAsScalaIterable(tailLines).toSeq logger.error("%s of %s:%n%s".format(summary, errorFile, StringUtils.join(tailLines, nl))) } else { logger.error("Unable to access log file: %s".format(errorFile)) diff --git 
a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala index e3a1714ff..2c33596e1 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QGraph.scala @@ -39,7 +39,7 @@ import collection.immutable.{TreeSet, TreeMap} import org.broadinstitute.sting.queue.function.scattergather.{ScatterFunction, CloneFunction, GatherFunction, ScatterGatherableFunction} import java.util.Date import org.broadinstitute.sting.utils.Utils -import org.apache.commons.io.{FileUtils, IOUtils} +import org.apache.commons.io.{FilenameUtils, FileUtils, IOUtils} import java.io.{OutputStreamWriter, File} /** @@ -71,6 +71,16 @@ class QGraph extends Logging { private val inProcessManager = new InProcessJobManager private def managers = Seq[Any](inProcessManager, commandLineManager) + /** + * If true, we will write out incremental job reports + */ + private val INCREMENTAL_JOBS_REPORT = true + + /** + * Holds the optional jobInfoReporter structure + */ + private var jobInfoReporter: QJobsReporter = null + private class StatusCounts { var pending = 0 var running = 0 @@ -79,6 +89,19 @@ class QGraph extends Logging { } private val statusCounts = new StatusCounts + /** + * Final initialization step of this QGraph -- tell it runtime setting information + * + * The settings aren't necessarily available until after this QGraph object has been constructed, so + * this function must be called once the QGraphSettings have been filled in. + * + * @param settings + */ + def initializeWithSettings(settings: QGraphSettings) { + this.settings = settings + this.jobInfoReporter = createJobsReporter() + } + /** * Adds a QScript created CommandLineFunction to the graph. * @param command Function to add to the graph. 
@@ -467,6 +490,12 @@ class QGraph extends Logging { checkRetryJobs(failedJobs) } + // incremental + if ( logNextStatusCounts && INCREMENTAL_JOBS_REPORT ) { + logger.info("Writing incremental jobs reports...") + writeJobsReport(false) + } + readyJobs ++= getReadyJobs } @@ -1084,6 +1113,39 @@ class QGraph extends Logging { } } + /** + * Create the jobsReporter for this QGraph, based on the settings data. + * + * Must be called after settings has been initialized properly + * + * @return + */ + private def createJobsReporter(): QJobsReporter = { + val jobStringName = if (settings.jobReportFile != null) + settings.jobReportFile + else + settings.qSettings.runName + ".jobreport.txt" + + val reportFile = org.broadinstitute.sting.utils.io.IOUtils.absolute(settings.qSettings.runDirectory, jobStringName) + + val pdfFile = if ( settings.run ) + Some(org.broadinstitute.sting.utils.io.IOUtils.absolute(settings.qSettings.runDirectory, FilenameUtils.removeExtension(jobStringName) + ".pdf")) + else + None + + new QJobsReporter(settings.disableJobReport, reportFile, pdfFile) + } + + /** + * Write, if possible, the jobs report + */ + def writeJobsReport(plot: Boolean = true) { + // note: the previous logic didn't write the job report if the system was shutting down, but I don't + // see any reason not to write the job report + if ( jobInfoReporter != null ) + jobInfoReporter.write(this, plot) + } + /** * Returns true if the graph was shutdown instead of exiting on its own. 
*/ diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala new file mode 100644 index 000000000..eeabe6d1d --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala @@ -0,0 +1,13 @@ +package org.broadinstitute.sting.queue.engine + +import org.broadinstitute.sting.commandline.ArgumentSource +import org.broadinstitute.sting.queue.util.RemoteFile + +/** + * Plugin to sends QStatus messages + */ +trait QStatusMessenger { + def started() + def done(files: Seq[Map[ArgumentSource, Seq[RemoteFile]]]) + def exit(message: String) +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala new file mode 100644 index 000000000..1193e7dec --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/cancer/MuTect.scala @@ -0,0 +1,407 @@ +package org.broadinstitute.sting.queue.extensions.cancer + +import java.io.File +import org.broadinstitute.sting.commandline.Argument +import org.broadinstitute.sting.commandline.Gather +import org.broadinstitute.sting.commandline.Input +import org.broadinstitute.sting.commandline.Output +import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction +import org.broadinstitute.sting.queue.extensions.gatk.{TaggedFile, VcfGatherFunction, LocusScatterFunction} + +class MuTect extends org.broadinstitute.sting.queue.extensions.gatk.CommandLineGATK with ScatterGatherableFunction { + analysisName = "MuTect" + analysis_type = "MuTect" + scatterClass = classOf[LocusScatterFunction] + + /** used for debugging, basically exit as soon as we get the reads */ + @Argument(fullName="noop", shortName="", doc="used for debugging, basically exit as soon as we get the reads", required=false, exclusiveOf="", validation="") + var noop: Boolean = _ + + /** add 
many additional columns of statistics to the output file */ + @Argument(fullName="enable_extended_output", shortName="", doc="add many additional columns of statistics to the output file", required=false, exclusiveOf="", validation="") + var enable_extended_output: Boolean = _ + + /** used when running the caller on a normal (as if it were a tumor) to detect artifacts */ + @Argument(fullName="artifact_detection_mode", shortName="", doc="used when running the caller on a normal (as if it were a tumor) to detect artifacts", required=false, exclusiveOf="", validation="") + var artifact_detection_mode: Boolean = _ + + /** name to use for tumor in output files */ + @Argument(fullName="tumor_sample_name", shortName="", doc="name to use for tumor in output files", required=false, exclusiveOf="", validation="") + var tumor_sample_name: String = _ + + /** if the tumor bam contains multiple samples, only use read groups with SM equal to this value */ + @Argument(fullName="bam_tumor_sample_name", shortName="", doc="if the tumor bam contains multiple samples, only use read groups with SM equal to this value", required=false, exclusiveOf="", validation="") + var bam_tumor_sample_name: String = _ + + /** name to use for normal in output files */ + @Argument(fullName="normal_sample_name", shortName="", doc="name to use for normal in output files", required=false, exclusiveOf="", validation="") + var normal_sample_name: String = _ + + /** force output for each site */ + @Argument(fullName="force_output", shortName="", doc="force output for each site", required=false, exclusiveOf="", validation="") + var force_output: Boolean = _ + + /** force output for all alleles at each site */ + @Argument(fullName="force_alleles", shortName="", doc="force output for all alleles at each site", required=false, exclusiveOf="", validation="") + var force_alleles: Boolean = _ + + /** only emit passing calls */ + @Argument(fullName="only_passing_calls", shortName="", doc="only emit passing calls", 
required=false, exclusiveOf="", validation="") + var only_passing_calls: Boolean = _ + + /** Initial LOD threshold for calling tumor variant */ + @Argument(fullName="initial_tumor_lod", shortName="", doc="Initial LOD threshold for calling tumor variant", required=false, exclusiveOf="", validation="") + var initial_tumor_lod: Option[Float] = None + + /** Format string for initial_tumor_lod */ + @Argument(fullName="initial_tumor_lodFormat", shortName="", doc="Format string for initial_tumor_lod", required=false, exclusiveOf="", validation="") + var initial_tumor_lodFormat: String = "%s" + + /** LOD threshold for calling tumor variant */ + @Argument(fullName="tumor_lod", shortName="", doc="LOD threshold for calling tumor variant", required=false, exclusiveOf="", validation="") + var tumor_lod: Option[Float] = None + + /** Format string for tumor_lod */ + @Argument(fullName="tumor_lodFormat", shortName="", doc="Format string for tumor_lod", required=false, exclusiveOf="", validation="") + var tumor_lodFormat: String = "%s" + + /** estimate of fraction (0-1) of physical contamination with other unrelated samples */ + @Argument(fullName="fraction_contamination", shortName="", doc="estimate of fraction (0-1) of physical contamination with other unrelated samples", required=false, exclusiveOf="", validation="") + var fraction_contamination: Option[Float] = None + + /** Format string for fraction_contamination */ + @Argument(fullName="fraction_contaminationFormat", shortName="", doc="Format string for fraction_contamination", required=false, exclusiveOf="", validation="") + var fraction_contaminationFormat: String = "%s" + + /** minimum fraction of cells which are presumed to have a mutation, used to handle non-clonality and contamination */ + @Argument(fullName="minimum_mutation_cell_fraction", shortName="", doc="minimum fraction of cells which are presumed to have a mutation, used to handle non-clonality and contamination", required=false, exclusiveOf="", validation="") + 
var minimum_mutation_cell_fraction: Option[Float] = None + + /** Format string for minimum_mutation_cell_fraction */ + @Argument(fullName="minimum_mutation_cell_fractionFormat", shortName="", doc="Format string for minimum_mutation_cell_fraction", required=false, exclusiveOf="", validation="") + var minimum_mutation_cell_fractionFormat: String = "%s" + + /** LOD threshold for calling normal non-germline */ + @Argument(fullName="normal_lod", shortName="", doc="LOD threshold for calling normal non-germline", required=false, exclusiveOf="", validation="") + var normal_lod: Option[Float] = None + + /** Format string for normal_lod */ + @Argument(fullName="normal_lodFormat", shortName="", doc="Format string for normal_lod", required=false, exclusiveOf="", validation="") + var normal_lodFormat: String = "%s" + + /** LOD threshold for calling normal non-variant */ + @Argument(fullName="normal_artifact_lod", shortName="", doc="LOD threshold for calling normal non-variant", required=false, exclusiveOf="", validation="") + var normal_artifact_lod: Option[Float] = None + + /** Format string for normal_artifact_lod */ + @Argument(fullName="normal_artifact_lodFormat", shortName="", doc="Format string for normal_artifact_lod", required=false, exclusiveOf="", validation="") + var normal_artifact_lodFormat: String = "%s" + + /** LOD threshold for calling strand bias */ + @Argument(fullName="strand_artifact_lod", shortName="", doc="LOD threshold for calling strand bias", required=false, exclusiveOf="", validation="") + var strand_artifact_lod: Option[Float] = None + + /** Format string for strand_artifact_lod */ + @Argument(fullName="strand_artifact_lodFormat", shortName="", doc="Format string for strand_artifact_lod", required=false, exclusiveOf="", validation="") + var strand_artifact_lodFormat: String = "%s" + + /** power threshold for calling strand bias */ + @Argument(fullName="strand_artifact_power_threshold", shortName="", doc="power threshold for calling strand bias", 
required=false, exclusiveOf="", validation="") + var strand_artifact_power_threshold: Option[Float] = None + + /** Format string for strand_artifact_power_threshold */ + @Argument(fullName="strand_artifact_power_thresholdFormat", shortName="", doc="Format string for strand_artifact_power_threshold", required=false, exclusiveOf="", validation="") + var strand_artifact_power_thresholdFormat: String = "%s" + + /** LOD threshold for calling normal non-variant at dbsnp sites */ + @Argument(fullName="dbsnp_normal_lod", shortName="", doc="LOD threshold for calling normal non-variant at dbsnp sites", required=false, exclusiveOf="", validation="") + var dbsnp_normal_lod: Option[Float] = None + + /** Format string for dbsnp_normal_lod */ + @Argument(fullName="dbsnp_normal_lodFormat", shortName="", doc="Format string for dbsnp_normal_lod", required=false, exclusiveOf="", validation="") + var dbsnp_normal_lodFormat: String = "%s" + + /** Power threshold for normal to determine germline vs variant */ + @Argument(fullName="somatic_classification_normal_power_threshold", shortName="", doc="Power threshold for normal to determine germline vs variant", required=false, exclusiveOf="", validation="") + var somatic_classification_normal_power_threshold: Option[Float] = None + + /** Format string for somatic_classification_normal_power_threshold */ + @Argument(fullName="somatic_classification_normal_power_thresholdFormat", shortName="", doc="Format string for somatic_classification_normal_power_threshold", required=false, exclusiveOf="", validation="") + var somatic_classification_normal_power_thresholdFormat: String = "%s" + + /** minimum allele fraction to be considered in normal, useful for normal sample contaminated with tumor */ + @Argument(fullName="minimum_normal_allele_fraction", shortName="", doc="minimum allele fraction to be considered in normal, useful for normal sample contaminated with tumor", required=false, exclusiveOf="", validation="") + var 
minimum_normal_allele_fraction: Option[Float] = None + + /** Format string for minimum_normal_allele_fraction */ + @Argument(fullName="minimum_normal_allele_fractionFormat", shortName="", doc="Format string for minimum_normal_allele_fraction", required=false, exclusiveOf="", validation="") + var minimum_normal_allele_fractionFormat: String = "%s" + + /** for computational efficiency, reject sites with allelic fraction below this threshold */ + @Argument(fullName="tumor_f_pretest", shortName="", doc="for computational efficiency, reject sites with allelic fraction below this threshold", required=false, exclusiveOf="", validation="") + var tumor_f_pretest: Option[Float] = None + + /** Format string for tumor_f_pretest */ + @Argument(fullName="tumor_f_pretestFormat", shortName="", doc="Format string for tumor_f_pretest", required=false, exclusiveOf="", validation="") + var tumor_f_pretestFormat: String = "%s" + + /** threshold for minimum base quality score */ + @Argument(fullName="min_qscore", shortName="", doc="threshold for minimum base quality score", required=false, exclusiveOf="", validation="") + var min_qscore: Option[Int] = None + + /** how many gapped events (ins/del) are allowed in proximity to this candidate */ + @Argument(fullName="gap_events_threshold", shortName="", doc="how many gapped events (ins/del) are allowed in proximity to this candidate", required=false, exclusiveOf="", validation="") + var gap_events_threshold: Option[Int] = None + + /** if this fraction or more of the bases in a read are soft/hard clipped, do not use this read for mutation calling */ + @Argument(fullName="heavily_clipped_read_fraction", shortName="", doc="if this fraction or more of the bases in a read are soft/hard clipped, do not use this read for mutation calling", required=false, exclusiveOf="", validation="") + var heavily_clipped_read_fraction: Option[Float] = None + + /** Format string for heavily_clipped_read_fraction */ + 
@Argument(fullName="heavily_clipped_read_fractionFormat", shortName="", doc="Format string for heavily_clipped_read_fraction", required=false, exclusiveOf="", validation="") + var heavily_clipped_read_fractionFormat: String = "%s" + + /** pvalue threshold for fishers exact test of clipping bias in mutant reads vs ref reads */ + @Argument(fullName="clipping_bias_pvalue_threshold", shortName="", doc="pvalue threshold for fishers exact test of clipping bias in mutant reads vs ref reads", required=false, exclusiveOf="", validation="") + var clipping_bias_pvalue_threshold: Option[Float] = None + + /** Format string for clipping_bias_pvalue_threshold */ + @Argument(fullName="clipping_bias_pvalue_thresholdFormat", shortName="", doc="Format string for clipping_bias_pvalue_threshold", required=false, exclusiveOf="", validation="") + var clipping_bias_pvalue_thresholdFormat: String = "%s" + + /** threshold for determining if there is relatedness between the alt and ref allele read piles */ + @Argument(fullName="fraction_mapq0_threshold", shortName="", doc="threshold for determining if there is relatedness between the alt and ref allele read piles", required=false, exclusiveOf="", validation="") + var fraction_mapq0_threshold: Option[Float] = None + + /** Format string for fraction_mapq0_threshold */ + @Argument(fullName="fraction_mapq0_thresholdFormat", shortName="", doc="Format string for fraction_mapq0_threshold", required=false, exclusiveOf="", validation="") + var fraction_mapq0_thresholdFormat: String = "%s" + + /** threshold for clustered read position artifact median */ + @Argument(fullName="pir_median_threshold", shortName="", doc="threshold for clustered read position artifact median", required=false, exclusiveOf="", validation="") + var pir_median_threshold: Option[Double] = None + + /** Format string for pir_median_threshold */ + @Argument(fullName="pir_median_thresholdFormat", shortName="", doc="Format string for pir_median_threshold", required=false, 
exclusiveOf="", validation="") + var pir_median_thresholdFormat: String = "%s" + + /** threshold for clustered read position artifact MAD */ + @Argument(fullName="pir_mad_threshold", shortName="", doc="threshold for clustered read position artifact MAD", required=false, exclusiveOf="", validation="") + var pir_mad_threshold: Option[Double] = None + + /** Format string for pir_mad_threshold */ + @Argument(fullName="pir_mad_thresholdFormat", shortName="", doc="Format string for pir_mad_threshold", required=false, exclusiveOf="", validation="") + var pir_mad_thresholdFormat: String = "%s" + + /** required minimum value for tumor alt allele maximum mapping quality score */ + @Argument(fullName="required_maximum_alt_allele_mapping_quality_score", shortName="", doc="required minimum value for tumor alt allele maximum mapping quality score", required=false, exclusiveOf="", validation="") + var required_maximum_alt_allele_mapping_quality_score: Option[Int] = None + + /** threshold for maximum alternate allele counts in normal */ + @Argument(fullName="max_alt_alleles_in_normal_count", shortName="", doc="threshold for maximum alternate allele counts in normal", required=false, exclusiveOf="", validation="") + var max_alt_alleles_in_normal_count: Option[Int] = None + + /** threshold for maximum alternate allele quality score sum in normal */ + @Argument(fullName="max_alt_alleles_in_normal_qscore_sum", shortName="", doc="threshold for maximum alternate allele quality score sum in normal", required=false, exclusiveOf="", validation="") + var max_alt_alleles_in_normal_qscore_sum: Option[Int] = None + + /** threshold for maximum alternate allele fraction in normal */ + @Argument(fullName="max_alt_allele_in_normal_fraction", shortName="", doc="threshold for maximum alternate allele fraction in normal", required=false, exclusiveOf="", validation="") + var max_alt_allele_in_normal_fraction: Option[Double] = None + + /** Format string for max_alt_allele_in_normal_fraction */ + 
@Argument(fullName="max_alt_allele_in_normal_fractionFormat", shortName="", doc="Format string for max_alt_allele_in_normal_fraction", required=false, exclusiveOf="", validation="") + var max_alt_allele_in_normal_fractionFormat: String = "%s" + + /** Phred scale quality score constant to use in power calculations */ + @Argument(fullName="power_constant_qscore", shortName="", doc="Phred scale quality score constant to use in power calculations", required=false, exclusiveOf="", validation="") + var power_constant_qscore: Option[Int] = None + + /** Absolute Copy Number Data, as defined by Absolute, to use in power calculations */ + @Argument(fullName="absolute_copy_number_data", shortName="", doc="Absolute Copy Number Data, as defined by Absolute, to use in power calculations", required=false, exclusiveOf="", validation="") + var absolute_copy_number_data: File = _ + + /** Allelic fraction constant to use in power calculations */ + @Argument(fullName="power_constant_af", shortName="", doc="Allelic fraction constant to use in power calculations", required=false, exclusiveOf="", validation="") + var power_constant_af: Option[Double] = None + + /** Format string for power_constant_af */ + @Argument(fullName="power_constant_afFormat", shortName="", doc="Format string for power_constant_af", required=false, exclusiveOf="", validation="") + var power_constant_afFormat: String = "%s" + + /** Call-stats output */ + @Output(fullName="out", shortName="o", doc="Call-stats output", required=false, exclusiveOf="", validation="") + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var out: File = _ + + /** + * Short name of out + * @return Short name of out + */ + def o = this.out + + /** + * Short name of out + * @param value Short name of out + */ + def o_=(value: File) { this.out = value } + + /** VCF output of mutation candidates */ + @Output(fullName="vcf", shortName="vcf", doc="VCF output of mutation candidates", 
required=false, exclusiveOf="", validation="") + @Gather(classOf[VcfGatherFunction]) + var vcf: File = _ + + /** Automatically generated index for vcf */ + @Output(fullName="vcfIndex", shortName="", doc="Automatically generated index for vcf", required=false, exclusiveOf="", validation="") + @Gather(enabled=false) + private var vcfIndex: File = _ + + /** Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests. */ + @Argument(fullName="no_cmdline_in_header", shortName="no_cmdline_in_header", doc="Don't output the usual VCF header tag with the command line. FOR DEBUGGING PURPOSES ONLY. This option is required in order to pass integration tests.", required=false, exclusiveOf="", validation="") + var no_cmdline_in_header: Boolean = _ + + /** Just output sites without genotypes (i.e. only the first 8 columns of the VCF) */ + @Argument(fullName="sites_only", shortName="sites_only", doc="Just output sites without genotypes (i.e. 
only the first 8 columns of the VCF)", required=false, exclusiveOf="", validation="") + var sites_only: Boolean = _ + + /** force BCF output, regardless of the file's extension */ + @Argument(fullName="bcf", shortName="bcf", doc="force BCF output, regardless of the file's extension", required=false, exclusiveOf="", validation="") + var bcf: Boolean = _ + + /** VCF file of DBSNP information */ + @Input(fullName="dbsnp", shortName="dbsnp", doc="VCF file of DBSNP information", required=false, exclusiveOf="", validation="") + var dbsnp: Seq[File] = Nil + + /** Dependencies on any indexes of dbsnp */ + @Input(fullName="dbsnpIndexes", shortName="", doc="Dependencies on any indexes of dbsnp", required=false, exclusiveOf="", validation="") + private var dbsnpIndexes: Seq[File] = Nil + + /** VCF file of COSMIC sites */ + @Input(fullName="cosmic", shortName="cosmic", doc="VCF file of COSMIC sites", required=false, exclusiveOf="", validation="") + var cosmic: Seq[File] = Nil + + /** Dependencies on any indexes of cosmic */ + @Input(fullName="cosmicIndexes", shortName="", doc="Dependencies on any indexes of cosmic", required=false, exclusiveOf="", validation="") + private var cosmicIndexes: Seq[File] = Nil + + /** VCF file of sites observed in normal */ + @Input(fullName="normal_panel", shortName="normal_panel", doc="VCF file of sites observed in normal", required=false, exclusiveOf="", validation="") + var normal_panel: Seq[File] = Nil + + /** Dependencies on any indexes of normal_panel */ + @Input(fullName="normal_panelIndexes", shortName="", doc="Dependencies on any indexes of normal_panel", required=false, exclusiveOf="", validation="") + private var normal_panelIndexes: Seq[File] = Nil + + /** write out coverage in WIGGLE format to this file */ + @Output(fullName="coverage_file", shortName="cov", doc="write out coverage in WIGGLE format to this file", required=false, exclusiveOf="", validation="") + 
@Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var coverage_file: File = _ + + /** + * Short name of coverage_file + * @return Short name of coverage_file + */ + def cov = this.coverage_file + + /** + * Short name of coverage_file + * @param value Short name of coverage_file + */ + def cov_=(value: File) { this.coverage_file = value } + + /** write out 20x of Q20 coverage in WIGGLE format to this file */ + @Output(fullName="coverage_20_q20_file", shortName="cov_q20", doc="write out 20x of Q20 coverage in WIGGLE format to this file", required=false, exclusiveOf="", validation="") + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var coverage_20_q20_file: File = _ + + /** + * Short name of coverage_20_q20_file + * @return Short name of coverage_20_q20_file + */ + def cov_q20 = this.coverage_20_q20_file + + /** + * Short name of coverage_20_q20_file + * @param value Short name of coverage_20_q20_file + */ + def cov_q20_=(value: File) { this.coverage_20_q20_file = value } + + /** write out power in WIGGLE format to this file */ + @Output(fullName="power_file", shortName="pow", doc="write out power in WIGGLE format to this file", required=false, exclusiveOf="", validation="") + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var power_file: File = _ + + /** + * Short name of power_file + * @return Short name of power_file + */ + def pow = this.power_file + + /** + * Short name of power_file + * @param value Short name of power_file + */ + def pow_=(value: File) { this.power_file = value } + + /** write out tumor read depth in WIGGLE format to this file */ + @Output(fullName="tumor_depth_file", shortName="tdf", doc="write out tumor read depth in WIGGLE format to this file", required=false, exclusiveOf="", validation="") + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + 
var tumor_depth_file: File = _ + + /** + * Short name of tumor_depth_file + * @return Short name of tumor_depth_file + */ + def tdf = this.tumor_depth_file + + /** + * Short name of tumor_depth_file + * @param value Short name of tumor_depth_file + */ + def tdf_=(value: File) { this.tumor_depth_file = value } + + /** write out normal read depth in WIGGLE format to this file */ + @Output(fullName="normal_depth_file", shortName="ndf", doc="write out normal read depth in WIGGLE format to this file", required=false, exclusiveOf="", validation="") + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var normal_depth_file: File = _ + + /** + * Short name of normal_depth_file + * @return Short name of normal_depth_file + */ + def ndf = this.normal_depth_file + + /** + * Short name of normal_depth_file + * @param value Short name of normal_depth_file + */ + def ndf_=(value: File) { this.normal_depth_file = value } + + /** if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up. 
*/ + @Argument(fullName="filter_mismatching_base_and_quals", shortName="filterMBQ", doc="if a read has mismatching number of bases and base qualities, filter out the read instead of blowing up.", required=false, exclusiveOf="", validation="") + var filter_mismatching_base_and_quals: Boolean = _ + + /** + * Short name of filter_mismatching_base_and_quals + * @return Short name of filter_mismatching_base_and_quals + */ + def filterMBQ = this.filter_mismatching_base_and_quals + + /** + * Short name of filter_mismatching_base_and_quals + * @param value Short name of filter_mismatching_base_and_quals + */ + def filterMBQ_=(value: Boolean) { this.filter_mismatching_base_and_quals = value } + + override def freezeFieldValues() { + super.freezeFieldValues() + if (vcf != null && !org.broadinstitute.sting.utils.io.IOUtils.isSpecialFile(vcf)) + if (!org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor.isCompressed(vcf.getPath)) + vcfIndex = new File(vcf.getPath + ".idx") + dbsnpIndexes ++= dbsnp.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) + cosmicIndexes ++= cosmic.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) + normal_panelIndexes ++= normal_panel.filter(orig => orig != null).map(orig => new File(orig.getPath + ".idx")) + } + + override def commandLine = super.commandLine + conditional(noop, "--noop", escape=true, format="%s") + conditional(enable_extended_output, "--enable_extended_output", escape=true, format="%s") + conditional(artifact_detection_mode, "--artifact_detection_mode", escape=true, format="%s") + optional("--tumor_sample_name", tumor_sample_name, spaceSeparated=true, escape=true, format="%s") + optional("--bam_tumor_sample_name", bam_tumor_sample_name, spaceSeparated=true, escape=true, format="%s") + optional("--normal_sample_name", normal_sample_name, spaceSeparated=true, escape=true, format="%s") + conditional(force_output, "--force_output", escape=true, format="%s") + 
conditional(force_alleles, "--force_alleles", escape=true, format="%s") + conditional(only_passing_calls, "--only_passing_calls", escape=true, format="%s") + optional("--initial_tumor_lod", initial_tumor_lod, spaceSeparated=true, escape=true, format=initial_tumor_lodFormat) + optional("--tumor_lod", tumor_lod, spaceSeparated=true, escape=true, format=tumor_lodFormat) + optional("--fraction_contamination", fraction_contamination, spaceSeparated=true, escape=true, format=fraction_contaminationFormat) + optional("--minimum_mutation_cell_fraction", minimum_mutation_cell_fraction, spaceSeparated=true, escape=true, format=minimum_mutation_cell_fractionFormat) + optional("--normal_lod", normal_lod, spaceSeparated=true, escape=true, format=normal_lodFormat) + optional("--normal_artifact_lod", normal_artifact_lod, spaceSeparated=true, escape=true, format=normal_artifact_lodFormat) + optional("--strand_artifact_lod", strand_artifact_lod, spaceSeparated=true, escape=true, format=strand_artifact_lodFormat) + optional("--strand_artifact_power_threshold", strand_artifact_power_threshold, spaceSeparated=true, escape=true, format=strand_artifact_power_thresholdFormat) + optional("--dbsnp_normal_lod", dbsnp_normal_lod, spaceSeparated=true, escape=true, format=dbsnp_normal_lodFormat) + optional("--somatic_classification_normal_power_threshold", somatic_classification_normal_power_threshold, spaceSeparated=true, escape=true, format=somatic_classification_normal_power_thresholdFormat) + optional("--minimum_normal_allele_fraction", minimum_normal_allele_fraction, spaceSeparated=true, escape=true, format=minimum_normal_allele_fractionFormat) + optional("--tumor_f_pretest", tumor_f_pretest, spaceSeparated=true, escape=true, format=tumor_f_pretestFormat) + optional("--min_qscore", min_qscore, spaceSeparated=true, escape=true, format="%s") + optional("--gap_events_threshold", gap_events_threshold, spaceSeparated=true, escape=true, format="%s") + optional("--heavily_clipped_read_fraction", 
heavily_clipped_read_fraction, spaceSeparated=true, escape=true, format=heavily_clipped_read_fractionFormat) + optional("--clipping_bias_pvalue_threshold", clipping_bias_pvalue_threshold, spaceSeparated=true, escape=true, format=clipping_bias_pvalue_thresholdFormat) + optional("--fraction_mapq0_threshold", fraction_mapq0_threshold, spaceSeparated=true, escape=true, format=fraction_mapq0_thresholdFormat) + optional("--pir_median_threshold", pir_median_threshold, spaceSeparated=true, escape=true, format=pir_median_thresholdFormat) + optional("--pir_mad_threshold", pir_mad_threshold, spaceSeparated=true, escape=true, format=pir_mad_thresholdFormat) + optional("--required_maximum_alt_allele_mapping_quality_score", required_maximum_alt_allele_mapping_quality_score, spaceSeparated=true, escape=true, format="%s") + optional("--max_alt_alleles_in_normal_count", max_alt_alleles_in_normal_count, spaceSeparated=true, escape=true, format="%s") + optional("--max_alt_alleles_in_normal_qscore_sum", max_alt_alleles_in_normal_qscore_sum, spaceSeparated=true, escape=true, format="%s") + optional("--max_alt_allele_in_normal_fraction", max_alt_allele_in_normal_fraction, spaceSeparated=true, escape=true, format=max_alt_allele_in_normal_fractionFormat) + optional("--power_constant_qscore", power_constant_qscore, spaceSeparated=true, escape=true, format="%s") + optional("--absolute_copy_number_data", absolute_copy_number_data, spaceSeparated=true, escape=true, format="%s") + optional("--power_constant_af", power_constant_af, spaceSeparated=true, escape=true, format=power_constant_afFormat) + optional("-o", out, spaceSeparated=true, escape=true, format="%s") + optional("-vcf", vcf, spaceSeparated=true, escape=true, format="%s") + conditional(no_cmdline_in_header, "-no_cmdline_in_header", escape=true, format="%s") + conditional(sites_only, "-sites_only", escape=true, format="%s") + conditional(bcf, "-bcf", escape=true, format="%s") + repeat("-dbsnp", dbsnp, 
formatPrefix=TaggedFile.formatCommandLineParameter, spaceSeparated=true, escape=true, format="%s") + repeat("-cosmic", cosmic, formatPrefix=TaggedFile.formatCommandLineParameter, spaceSeparated=true, escape=true, format="%s") + repeat("-normal_panel", normal_panel, formatPrefix=TaggedFile.formatCommandLineParameter, spaceSeparated=true, escape=true, format="%s") + optional("-cov", coverage_file, spaceSeparated=true, escape=true, format="%s") + optional("-cov_q20", coverage_20_q20_file, spaceSeparated=true, escape=true, format="%s") + optional("-pow", power_file, spaceSeparated=true, escape=true, format="%s") + optional("-tdf", tumor_depth_file, spaceSeparated=true, escape=true, format="%s") + optional("-ndf", normal_depth_file, spaceSeparated=true, escape=true, format="%s") + conditional(filter_mismatching_base_and_quals, "-filterMBQ", escape=true, format="%s") +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala index 6cd4b06bc..a59f273ad 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/BamGatherFunction.scala @@ -26,19 +26,20 @@ package org.broadinstitute.sting.queue.extensions.gatk import org.broadinstitute.sting.queue.function.scattergather.GatherFunction import org.broadinstitute.sting.queue.extensions.picard.PicardBamFunction -import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction} import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterArgumentTypeDescriptor +import org.broadinstitute.sting.queue.util.ClassFieldCache /** * Merges BAM files using net.sf.picard.sam.MergeSamFiles. 
*/ -class BamGatherFunction extends GatherFunction with PicardBamFunction { +class BamGatherFunction extends GatherFunction with PicardBamFunction with RetryMemoryLimit { this.javaMainClass = "net.sf.picard.sam.MergeSamFiles" this.assumeSorted = Some(true) protected def inputBams = gatherParts protected def outputBam = originalOutput - override def freezeFieldValues { + override def freezeFieldValues() { val originalGATK = originalFunction.asInstanceOf[CommandLineGATK] // Whatever the original function can handle, merging *should* do less. @@ -47,13 +48,13 @@ class BamGatherFunction extends GatherFunction with PicardBamFunction { // bam_compression and index_output_bam_on_the_fly from SAMFileWriterArgumentTypeDescriptor // are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK - val compression = QFunction.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.COMPRESSION_FULLNAME) + val compression = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.COMPRESSION_FULLNAME) this.compressionLevel = originalGATK.getFieldValue(compression).asInstanceOf[Option[Int]] - val disableIndex = QFunction.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.DISABLE_INDEXING_FULLNAME) + val disableIndex = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.DISABLE_INDEXING_FULLNAME) this.createIndex = Some(!originalGATK.getFieldValue(disableIndex).asInstanceOf[Boolean]) - val enableMD5 = QFunction.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.ENABLE_MD5_FULLNAME) + val enableMD5 = ClassFieldCache.findField(originalFunction.getClass, SAMFileWriterArgumentTypeDescriptor.ENABLE_MD5_FULLNAME) this.createMD5 = Some(originalGATK.getFieldValue(enableMD5).asInstanceOf[Boolean]) super.freezeFieldValues() diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala 
b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala index e619c0a02..395a34c60 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/GATKIntervals.scala @@ -92,6 +92,6 @@ object GATKIntervals { } private def createBinding(interval: String, argumentName: String, tags: Tags): IntervalBinding[Feature] = { - ArgumentTypeDescriptor.parseBinding(interval, classOf[Feature], classOf[IntervalBinding[Feature]], argumentName, tags, argumentName).asInstanceOf[IntervalBinding[Feature]] + ArgumentTypeDescriptor.parseBinding(new ArgumentMatchStringValue(interval), classOf[Feature], classOf[IntervalBinding[Feature]], argumentName, tags, argumentName).asInstanceOf[IntervalBinding[Feature]] } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala index 739e6cc91..fb22554f0 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala @@ -25,13 +25,14 @@ package org.broadinstitute.sting.queue.extensions.gatk import org.broadinstitute.sting.queue.function.scattergather.GatherFunction -import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.queue.function.{RetryMemoryLimit, QFunction} import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor +import org.broadinstitute.sting.queue.util.ClassFieldCache /** * Merges a vcf text file. 
*/ -class VcfGatherFunction extends CombineVariants with GatherFunction { +class VcfGatherFunction extends CombineVariants with GatherFunction with RetryMemoryLimit { this.assumeIdenticalSamples = true this.suppressCommandLineHeader = true @@ -46,10 +47,10 @@ class VcfGatherFunction extends CombineVariants with GatherFunction { // NO_HEADER and sites_only from VCFWriterArgumentTypeDescriptor // are added by the GATKExtensionsGenerator to the subclass of CommandLineGATK - val noHeader = QFunction.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.NO_HEADER_ARG_NAME) + val noHeader = ClassFieldCache.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.NO_HEADER_ARG_NAME) this.no_cmdline_in_header = originalGATK.getFieldValue(noHeader).asInstanceOf[Boolean] - val sitesOnly = QFunction.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.SITES_ONLY_ARG_NAME) + val sitesOnly = ClassFieldCache.findField(originalFunction.getClass, VCFWriterArgumentTypeDescriptor.SITES_ONLY_ARG_NAME) this.sites_only = originalGATK.getFieldValue(sitesOnly).asInstanceOf[Boolean] // ensure that the gather function receives the same unsafe parameter as the scattered function diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala new file mode 100644 index 000000000..75e9300dc --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CalculateHsMetrics.scala @@ -0,0 +1,60 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline.{Argument, Output, Input} +import java.io.File + +/** + * Created with IntelliJ IDEA. + * User: delangel + * Date: 10/9/12 + * Time: 5:59 PM + * To change this template use File | Settings | File Templates. 
+ */ +class CalculateHsMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { + analysisName = "CalculateHsMetrics" + javaMainClass = "net.sf.picard.sam.CalculateHsMetrics" + + @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) + var input: Seq[File] = Nil + + @Output(doc="The output file to write statistics to", shortName = "output", fullName = "output_file", required = true) + var output: File = _ + + @Argument(doc="Interval list with targets", shortName = "targets", fullName = "target_list", required = true) + var targets: File = _ + + @Argument(doc="Interval list with baits", shortName = "baits", fullName = "bait_list", required = true) + var baits: File = _ + + @Argument(doc="Reference file", shortName = "reference", fullName = "reference", required = true) + var reference: File = _ + /* + @Argument(doc = "Maximum number of file handles to keep open when spilling read ends to disk. Set this number a little lower than the per-process maximum number of file that may be open. This number can be found by executing the 'ulimit -n' command on a Unix system.", shortName = "max_file_handles", fullName ="max_file_handles_for_read_ends_maps", required=false) + var MAX_FILE_HANDLES_FOR_READ_ENDS_MAP: Int = -1; + + @Argument(doc = "This number, plus the maximum RAM available to the JVM, determine the memory footprint used by some of the sorting collections. 
If you are running out of memory, try reducing this number.", shortName = "sorting_ratio", fullName = "sorting_collection_size_ratio", required = false) + var SORTING_COLLECTION_SIZE_RATIO: Double = -1 + */ + override def freezeFieldValues() { + super.freezeFieldValues() +// if (outputIndex == null && output != null) + // outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + } + + val level = "SAMPLE" + + override def inputBams = input + override def outputBam = output + //this.sortOrder = null + //this.createIndex = Some(true) + override def commandLine = super.commandLine + + required("BAIT_INTERVALS=" + baits) + + required("TARGET_INTERVALS=" + targets) + + required("REFERENCE_SEQUENCE=" + reference) + + optional("METRIC_ACCUMULATION_LEVEL="+level)/*+ + conditional(REMOVE_DUPLICATES, "REMOVE_DUPLICATES=true") + + conditional(MAX_FILE_HANDLES_FOR_READ_ENDS_MAP > 0, "MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=" + MAX_FILE_HANDLES_FOR_READ_ENDS_MAP.toString) + + conditional(SORTING_COLLECTION_SIZE_RATIO > 0, "SORTING_COLLECTION_SIZE_RATIO=" + SORTING_COLLECTION_SIZE_RATIO.toString) */ + + +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala new file mode 100644 index 000000000..de2b0af9e --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectGcBiasMetrics.scala @@ -0,0 +1,32 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline.{Argument, Output, Input} +import java.io.File + +/** + * Created with IntelliJ IDEA. + * User: delangel + * Date: 10/10/12 + * Time: 10:37 AM + * To change this template use File | Settings | File Templates. 
+ */ +class CollectGcBiasMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction { + analysisName = "CalculateGcMetrics" + javaMainClass = "net.sf.picard.sam.CalculateGcMetrics" + + @Input(doc="The input SAM or BAM files to analyze. Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) + var input: Seq[File] = Nil + + @Output(doc="The output file to write statistics to", shortName = "output", fullName = "output_file", required = true) + var output: File = _ + + @Argument(doc="Reference file", shortName = "reference", fullName = "reference", required = true) + var reference: File = _ + + override def inputBams = input + override def outputBam = output + override def commandLine = super.commandLine + + required("CHART_OUTPUT=" + output+".pdf") + + required("REFERENCE_SEQUENCE=" + reference) + + required("ASSUME_SORTED=true") +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala new file mode 100644 index 000000000..a9af4e858 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/CollectMultipleMetrics.scala @@ -0,0 +1,36 @@ +package org.broadinstitute.sting.queue.extensions.picard + +import org.broadinstitute.sting.commandline.{Argument, Output, Input} +import java.io.File + +/** + * Created with IntelliJ IDEA. + * User: delangel + * Date: 10/10/12 + * Time: 10:37 AM + * To change this template use File | Settings | File Templates. + */ +class CollectMultipleMetrics extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction{ + analysisName = "CalculateMultipleMetrics" + javaMainClass = "net.sf.picard.sam.CalculateMultipleMetrics" + + @Input(doc="The input SAM or BAM files to analyze. 
Must be coordinate sorted.", shortName = "input", fullName = "input_bam_files", required = true) + var input: Seq[File] = Nil + + @Output(doc="The output file to write statistics to", shortName = "output", fullName = "output_file", required = true) + var output: File = _ + + @Argument(doc="Reference file", shortName = "reference", fullName = "reference", required = true) + var reference: File = _ + + override def inputBams = input + override def outputBam = output + override def commandLine = super.commandLine + + required("REFERENCE_SEQUENCE=" + reference) + + required("ASSUME_SORTED=true") + + required("PROGRAM=QualityScoreDistribution") + + required("PROGRAM=MeanQualityByCycle") + + required("PROGRAM=CollectAlignmentSummaryMetrics" ) + + +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala index 9257cc7c2..b22bb2b59 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SortSam.scala @@ -50,7 +50,7 @@ class SortSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFun override def freezeFieldValues() { super.freezeFieldValues() if (outputIndex == null && output != null) - outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai") + outputIndex = new File(output.getAbsolutePath.stripSuffix(".bam") + ".bai") } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala index 84b625760..eb426d301 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala @@ -25,6 +25,7 @@ package org.broadinstitute.sting.queue.function import org.broadinstitute.sting.queue.util._ +import 
org.broadinstitute.sting.commandline.Argument /** * A command line that will be run in a pipeline. @@ -33,12 +34,15 @@ trait CommandLineFunction extends QFunction with Logging { def commandLine: String /** Upper memory limit */ + @Argument(doc="Memory limit", required=false) var memoryLimit: Option[Double] = None /** Resident memory limit */ + @Argument(doc="Resident memory limit", required=false) var residentLimit: Option[Double] = None /** Resident memory request */ + @Argument(doc="Resident memory request", required=false) var residentRequest: Option[Double] = None /** the number of SMP cores this job wants */ diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala index b9cb8540f..6500360c0 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/JavaCommandLineFunction.scala @@ -47,6 +47,7 @@ trait JavaCommandLineFunction extends CommandLineFunction { /** * Memory limit for the java executable, or if None will use the default memoryLimit. 
*/ + @Argument(doc="Java memory limit", required=false) var javaMemoryLimit: Option[Double] = None /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala index 9f7932d39..3849b976a 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/QFunction.scala @@ -28,7 +28,6 @@ import java.io.File import java.lang.annotation.Annotation import org.broadinstitute.sting.commandline._ import org.broadinstitute.sting.queue.{QException, QSettings} -import collection.JavaConversions._ import java.lang.IllegalStateException import org.broadinstitute.sting.queue.util._ import org.broadinstitute.sting.utils.io.IOUtils @@ -113,11 +112,13 @@ trait QFunction extends Logging with QJobReport { var jobErrorFile: File = _ /** Errors (if any) from the last failed run of jobErrorFiles. */ + @Argument(doc="Job error lines", required=false) var jobErrorLines: Seq[String] = Nil /** * The number of times this function has previously been run. */ + @Argument(doc="Job retries", required=false) var retries = 0 /** Change settings for the next run. Retries will be set to the number of times the function was run and jobErrorLines may contain the error text. */ @@ -192,13 +193,13 @@ trait QFunction extends Logging with QJobReport { def failOutputs: Seq[File] = statusPrefixes.map(path => new File(path + ".fail")) /** The complete list of fields on this CommandLineFunction. */ - def functionFields = QFunction.classFields(this.functionFieldClass).functionFields + def functionFields: Seq[ArgumentSource] = ClassFieldCache.classFunctionFields(this.functionFieldClass) /** The @Input fields on this CommandLineFunction. 
*/ - def inputFields = QFunction.classFields(this.functionFieldClass).inputFields + def inputFields: Seq[ArgumentSource] = ClassFieldCache.classInputFields(this.functionFieldClass) /** The @Output fields on this CommandLineFunction. */ - def outputFields = QFunction.classFields(this.functionFieldClass).outputFields + def outputFields: Seq[ArgumentSource] = ClassFieldCache.classOutputFields(this.functionFieldClass) /** The @Argument fields on this CommandLineFunction. */ - def argumentFields = QFunction.classFields(this.functionFieldClass).argumentFields + def argumentFields: Seq[ArgumentSource] = ClassFieldCache.classArgumentFields(this.functionFieldClass) /** * Returns the class that should be used for looking up fields. @@ -473,72 +474,12 @@ trait QFunction extends Logging with QJobReport { * @param source Field to get the value for. * @return value of the field. */ - def getFieldValue(source: ArgumentSource) = ReflectionUtils.getValue(invokeObj(source), source.field) + def getFieldValue(source: ArgumentSource) = ClassFieldCache.getFieldValue(this, source) /** * Gets the value of a field. * @param source Field to set the value for. * @return value of the field. */ - def setFieldValue(source: ArgumentSource, value: Any) = ReflectionUtils.setValue(invokeObj(source), source.field, value) - - /** - * Walks gets the fields in this object or any collections in that object - * recursively to find the object holding the field to be retrieved or set. - * @param source Field find the invoke object for. - * @return Object to invoke the field on. - */ - private def invokeObj(source: ArgumentSource) = source.parentFields.foldLeft[AnyRef](this)(ReflectionUtils.getValue(_, _)) -} - -object QFunction { - var parsingEngine: ParsingEngine = _ - - /** - * The list of fields defined on a class - * @param clazz The class to lookup fields. - */ - private class ClassFields(clazz: Class[_]) { - /** The complete list of fields on this CommandLineFunction. 
*/ - val functionFields: Seq[ArgumentSource] = parsingEngine.extractArgumentSources(clazz).toSeq - /** The @Input fields on this CommandLineFunction. */ - val inputFields = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Input])) - /** The @Output fields on this CommandLineFunction. */ - val outputFields = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Output])) - /** The @Argument fields on this CommandLineFunction. */ - val argumentFields = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Argument])) - } - - /** - * The mapping from class to fields. - */ - private var classFieldsMap = Map.empty[Class[_], ClassFields] - - /** - * Returns the field on clazz. - * @param clazz Class to search. - * @param name Name of the field to return. - * @return Argument source for the field. - */ - def findField(clazz: Class[_], name: String) = { - classFields(clazz).functionFields.find(_.field.getName == name) match { - case Some(source) => source - case None => throw new QException("Could not find a field on class %s with name %s".format(clazz, name)) - } - } - - /** - * Returns the fields for a class. - * @param clazz Class to retrieve fields for. - * @return the fields for the class. 
- */ - private def classFields(clazz: Class[_]) = { - classFieldsMap.get(clazz) match { - case Some(classFields) => classFields - case None => - val classFields = new ClassFields(clazz) - classFieldsMap += clazz -> classFields - classFields - } - } + def setFieldValue(source: ArgumentSource, value: Any) = ClassFieldCache.setFieldValue(this, source, value) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala b/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala index 8bba5551f..acc9a7203 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/RetryMemoryLimit.scala @@ -24,17 +24,26 @@ package org.broadinstitute.sting.queue.function +import org.broadinstitute.sting.commandline.Argument + +object RetryMemoryLimit { + private val defaultRetryMemoryFunction: (Double => Double) = ( 2 * _ ) + private val defaultMemoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT") +} + /** A mixin that on retry increases the memory limit when certain text is found. */ trait RetryMemoryLimit extends CommandLineFunction { /** How to increase the memory. By default doubles the memory. */ - var retryMemoryFunction: (Double => Double) = (2 * _) + var retryMemoryFunction: (Double => Double) = RetryMemoryLimit.defaultRetryMemoryFunction /** Once the threshold is passed, no more memory will be added to memory limit. */ + @Argument(doc="threshold to stop doubling the memory", required=false) var memoryLimitThreshold: Option[Double] = None /** Various strings to look for to determine we ran out of memory. 
*/ - var memoryLimitErrorText = Seq("OutOfMemory", "you did not provide enough memory", "TERM_MEMLIMIT") + @Argument(doc="text to look for in the errors", required = false) + var memoryLimitErrorText = RetryMemoryLimit.defaultMemoryLimitErrorText override def freezeFieldValues() { super.freezeFieldValues() @@ -42,6 +51,21 @@ trait RetryMemoryLimit extends CommandLineFunction { this.memoryLimitThreshold = this.qSettings.memoryLimitThreshold } + + override def copySettingsTo(function: QFunction) { + super.copySettingsTo(function) + function match { + case retryMemoryLimit: RetryMemoryLimit => + if (retryMemoryLimit.memoryLimitThreshold.isEmpty) + retryMemoryLimit.memoryLimitThreshold = this.memoryLimitThreshold + if (retryMemoryLimit.retryMemoryFunction == RetryMemoryLimit.defaultRetryMemoryFunction) + retryMemoryLimit.retryMemoryFunction = this.retryMemoryFunction + if (retryMemoryLimit.memoryLimitErrorText == RetryMemoryLimit.defaultMemoryLimitErrorText) + retryMemoryLimit.memoryLimitErrorText = this.memoryLimitErrorText + case _ => /* ignore */ + } + } + override def setupRetry() { super.setupRetry() if (this.memoryLimitThreshold.isDefined && this.memoryLimit.isDefined) { diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala index 5b4f2b7e6..91cacbb71 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/CloneFunction.scala @@ -25,11 +25,16 @@ package org.broadinstitute.sting.queue.function.scattergather import org.broadinstitute.sting.commandline.ArgumentSource -import org.broadinstitute.sting.queue.function.{QFunction, CommandLineFunction} +import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.queue.util.ClassFieldCache /** * Shadow clones another 
command line function. */ +object CloneFunction { + private lazy val cloneFunctionFields = ClassFieldCache.classFunctionFields(classOf[CloneFunction]) +} + class CloneFunction extends CommandLineFunction { var originalFunction: ScatterGatherableFunction = _ var cloneIndex: Int = _ @@ -41,10 +46,10 @@ class CloneFunction extends CommandLineFunction { var originalValues = Map.empty[ArgumentSource, Any] withScatterPartCount += 1 if (withScatterPartCount == 1) { - overriddenFields.foreach{ - case (field, overrideValue) => { + originalFunction.functionFields.foreach { + case (field) => { originalValues += field -> originalFunction.getFieldValue(field) - originalFunction.setFieldValue(field, overrideValue) + originalFunction.setFieldValue(field, getFieldValue(field)) } } } @@ -52,9 +57,11 @@ class CloneFunction extends CommandLineFunction { f() } finally { if (withScatterPartCount == 1) { - originalValues.foreach{ - case (name, value) => - originalFunction.setFieldValue(name, value) + originalFunction.functionFields.foreach { + case (field) => { + setFieldValue(field, originalFunction.getFieldValue(field)) + originalFunction.setFieldValue(field, originalValues(field)) + } } } withScatterPartCount -= 1 @@ -63,28 +70,36 @@ class CloneFunction extends CommandLineFunction { override def description = withScatterPart(() => originalFunction.description) override def shortDescription = withScatterPart(() => originalFunction.shortDescription) + override def setupRetry() { withScatterPart(() => originalFunction.setupRetry()) } + override protected def functionFieldClass = originalFunction.getClass def commandLine = withScatterPart(() => originalFunction.commandLine) def getFieldValue(field: String): AnyRef = { - val source = QFunction.findField(originalFunction.getClass, field) + val source = ClassFieldCache.findField(originalFunction.getClass, field) getFieldValue(source) } override def getFieldValue(source: ArgumentSource): AnyRef = { - overriddenFields.get(source) match { - 
case Some(value) => value.asInstanceOf[AnyRef] - case None => { - val value = originalFunction.getFieldValue(source) - overriddenFields += source -> value - value - } + CloneFunction.cloneFunctionFields.find(_.field.getName == source.field.getName) match { + case Some(cloneSource) => + super.getFieldValue(cloneSource) + case None => + overriddenFields.get(source) match { + case Some(value) => + value.asInstanceOf[AnyRef] + case None => { + val value = originalFunction.getFieldValue(source) + overriddenFields += source -> value + value + } + } } } def setFieldValue(field: String, value: Any) { - val source = QFunction.findField(originalFunction.getClass, field) + val source = ClassFieldCache.findField(originalFunction.getClass, field) setFieldValue(source, value) } diff --git a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala index 4578f0e82..5dd7d4c79 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/function/scattergather/ScatterGatherableFunction.scala @@ -91,7 +91,7 @@ trait ScatterGatherableFunction extends CommandLineFunction { if (qSettings.jobScatterGatherDirectory != null) { this.scatterGatherDirectory = IOUtils.absolute(qSettings.jobScatterGatherDirectory) } else { - this.scatterGatherDirectory = IOUtils.absolute(this.commandDirectory, "queueScatterGather") + this.scatterGatherDirectory = IOUtils.absolute(this.commandDirectory, ".queue/scatterGather") } } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala new file mode 100644 index 000000000..870dd5617 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ClassFieldCache.scala @@ -0,0 +1,183 @@ 
+package org.broadinstitute.sting.queue.util + +import org.broadinstitute.sting.commandline._ +import scala.Some +import org.broadinstitute.sting.queue.QException +import collection.JavaConversions._ +import java.io.File + +/** + * Utilities and a static cache of argument fields for various classes populated by the parsingEngine. + * Because this class works with the ParsingEngine it can walk @ArgumentCollection hierarchies. + */ +object ClassFieldCache { + var parsingEngine: ParsingEngine = _ + + + // + // Field caching + // + + /** + * The list of fields defined on a class + * @param clazz The class to lookup fields. + */ + private class ClassFields(clazz: Class[_]) { + /** The complete list of fields on this CommandLineFunction. */ + val functionFields: Seq[ArgumentSource] = parsingEngine.extractArgumentSources(clazz).toSeq + /** The @Input fields on this CommandLineFunction. */ + val inputFields: Seq[ArgumentSource] = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Input])) + /** The @Output fields on this CommandLineFunction. */ + val outputFields: Seq[ArgumentSource] = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Output])) + /** The @Argument fields on this CommandLineFunction. */ + val argumentFields: Seq[ArgumentSource] = functionFields.filter(source => ReflectionUtils.hasAnnotation(source.field, classOf[Argument])) + } + + /** + * The mapping from class to fields. + */ + private var classFieldsMap = Map.empty[Class[_], ClassFields] + + /** + * Returns the fields for a class. + * @param clazz Class to retrieve fields for. + * @return the fields for the class. + */ + private def classFields(clazz: Class[_]): ClassFields = { + classFieldsMap.get(clazz) match { + case Some(classFields) => classFields + case None => + val classFields = new ClassFields(clazz) + classFieldsMap += clazz -> classFields + classFields + } + } + + /** + * Returns the field on clazz. 
+ * @param clazz Class to search. + * @param name Name of the field to return. + * @return Argument source for the field. + */ + def findField(clazz: Class[_], name: String): ArgumentSource = { + classFields(clazz).functionFields.find(_.field.getName == name) match { + case Some(source) => source + case None => throw new QException("Could not find a field on class %s with name %s".format(clazz, name)) + } + } + + /** + * Returns the Seq of fields for a QFunction class. + * @param clazz Class to retrieve fields for. + * @return the fields of the class. + */ + def classFunctionFields(clazz: Class[_]): Seq[ArgumentSource] = classFields(clazz).functionFields + + /** + * Returns the Seq of inputs for a QFunction class. + * @param clazz Class to retrieve inputs for. + * @return the inputs of the class. + */ + def classInputFields(clazz: Class[_]): Seq[ArgumentSource] = classFields(clazz).inputFields + + /** + * Returns the Seq of outputs for a QFunction class. + * @param clazz Class to retrieve outputs for. + * @return the outputs of the class. + */ + def classOutputFields(clazz: Class[_]): Seq[ArgumentSource] = classFields(clazz).outputFields + + /** + * Returns the Seq of arguments for a QFunction class. + * @param clazz Class to retrieve arguments for. + * @return the arguments of the class. + */ + def classArgumentFields(clazz: Class[_]): Seq[ArgumentSource] = classFields(clazz).argumentFields + + + // + // get/set fields as AnyRef + // + + /** + * Gets the value of a field. + * @param obj Top level object storing the source info. + * @param source Field to get the value for. + * @return value of the field. + */ + def getFieldValue(obj: AnyRef, source: ArgumentSource) = ReflectionUtils.getValue(invokeObj(obj, source), source.field) + + /** + * Gets the value of a field. + * @param obj Top level object storing the source info. + * @param source Field to set the value for. + * @return value of the field. 
+ */ + def setFieldValue(obj: AnyRef, source: ArgumentSource, value: Any) = ReflectionUtils.setValue(invokeObj(obj, source), source.field, value) + + /** + * Walks gets the fields in this object or any collections in that object + * recursively to find the object holding the field to be retrieved or set. + * @param obj Top level object storing the source info. + * @param source Field find the invoke object for. + * @return Object to invoke the field on. + */ + private def invokeObj(obj: AnyRef, source: ArgumentSource) = source.parentFields.foldLeft[AnyRef](obj)(ReflectionUtils.getValue(_, _)) + + + // + // get/set fields as java.io.File + // + + /** + * Gets the files from the fields. The fields must be a File, a FileExtension, or a Seq or Set of either. + * @param obj Top level object storing the source info. + * @param fields Fields to get files. + * @return for the fields. + */ + def getFieldFiles(obj: AnyRef, fields: Seq[ArgumentSource]): Seq[File] = { + var files: Seq[File] = Nil + for (field <- fields) + files ++= getFieldFiles(obj, field) + files.distinct + } + + /** + * Gets the files from the field. The field must be a File, a FileExtension, or a Seq or Set of either. + * @param obj Top level object storing the source info. + * @param field Field to get files. + * @return for the field. + */ + def getFieldFiles(obj: AnyRef, field: ArgumentSource): Seq[File] = { + var files: Seq[File] = Nil + CollectionUtils.foreach(getFieldValue(obj, field), (fieldValue) => { + val file = fieldValueToFile(field, fieldValue) + if (file != null) + files :+= file + }) + files.distinct + } + + /** + * Gets the file from the field. The field must be a File or a FileExtension and not a Seq or Set. + * @param obj Top level object storing the source info. + * @param field Field to get the file. + * @return for the field. 
+ */ + def getFieldFile(obj: AnyRef, field: ArgumentSource): File = + fieldValueToFile(field, getFieldValue(obj, field)) + + /** + * Converts the field value to a file. The field must be a File or a FileExtension. + * @param field Field to get the file. + * @param value Value of the File or FileExtension or null. + * @return Null if value is null, otherwise the File. + * @throws QException if the value is not a File or FileExtension. + */ + private def fieldValueToFile(field: ArgumentSource, value: Any): File = value match { + case file: File => file + case null => null + case unknown => throw new QException("Non-file found. Try removing the annotation, change the annotation to @Argument, or extend File with FileExtension: %s: %s".format(field.field, unknown)) + } + +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala index c69a310b3..0600f9ad5 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobReport.scala @@ -25,13 +25,8 @@ package org.broadinstitute.sting.queue.util import org.broadinstitute.sting.queue.function.QFunction -import org.broadinstitute.sting.gatk.report.{GATKReportTable, GATKReport} -import org.broadinstitute.sting.utils.exceptions.UserException +import org.broadinstitute.sting.gatk.report.GATKReportTable import org.broadinstitute.sting.queue.engine.JobRunInfo -import java.io.{PrintStream, File} -import org.broadinstitute.sting.utils.R.{RScriptLibrary, RScriptExecutor} -import org.broadinstitute.sting.utils.io.Resource -import org.apache.commons.io.{IOUtils, FileUtils} /** * A mixin to add Job info to the class @@ -98,31 +93,10 @@ trait QJobReport extends Logging { } object QJobReport { - val JOB_REPORT_QUEUE_SCRIPT = "queueJobReport.R" - // todo -- fixme to have a unique name for Scatter/gather jobs as well var seenCounter = 1 var seenNames = 
Set[String]() - def printReport(jobsRaw: Map[QFunction, JobRunInfo], dest: File) { - val jobs = jobsRaw.filter(_._2.isFilledIn).filter(_._1.includeInReport) - jobs foreach {case (qf, info) => qf.setRunInfo(info)} - val stream = new PrintStream(FileUtils.openOutputStream(dest)) - try { - printJobLogging(jobs.keys.toSeq, stream) - } finally { - IOUtils.closeQuietly(stream) - } - } - - def plotReport(reportFile: File, pdfFile: File) { - val executor = new RScriptExecutor - executor.addLibrary(RScriptLibrary.GSALIB) - executor.addScript(new Resource(JOB_REPORT_QUEUE_SCRIPT, classOf[QJobReport])) - executor.addArgs(reportFile.getAbsolutePath, pdfFile.getAbsolutePath) - executor.exec() - } - def workAroundSameJobNames(func: QFunction):String = { if ( seenNames.apply(func.jobName) ) { seenCounter += 1 @@ -132,45 +106,4 @@ object QJobReport { func.jobName } } - - /** - * Prints the JobLogging logs to a GATKReport. First splits up the - * logs by group, and for each group generates a GATKReportTable - */ - private def printJobLogging(logs: Seq[QFunction], stream: PrintStream) { - // create the report - val report: GATKReport = new GATKReport - - // create a table for each group of logs - for ( (group, groupLogs) <- groupLogs(logs) ) { - val keys = logKeys(groupLogs) - report.addTable(group, "Job logs for " + group, keys.size) - val table: GATKReportTable = report.getTable(group) - - // add the columns - keys.foreach(table.addColumn(_)) - for (log <- groupLogs) { - for ( key <- keys ) - table.set(log.getReportName, key, log.getReportFeature(key)) - } - } - - report.print(stream) - } - - private def groupLogs(logs: Seq[QFunction]): Map[String, Seq[QFunction]] = { - logs.groupBy(_.getReportGroup) - } - - private def logKeys(logs: Seq[QFunction]): Set[String] = { - // the keys should be the same for each log, but we will check that - val keys = Set[String](logs(0).getReportFeatureNames : _*) - - for ( log <- logs ) - if ( keys.sameElements(Set(log.getReportFeatureNames)) ) - 
throw new UserException(("All JobLogging jobs in the same group must have the same set of features. " + - "We found one with %s and another with %s").format(keys, log.getReportFeatureNames)) - - keys - } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala new file mode 100644 index 000000000..a23fe4485 --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QJobsReporter.scala @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.queue.util + +import java.io.{PrintStream, File} +import org.broadinstitute.sting.utils.io.{Resource} +import org.broadinstitute.sting.queue.engine.{JobRunInfo, QGraph} +import org.broadinstitute.sting.queue.function.QFunction +import org.broadinstitute.sting.utils.R.{RScriptLibrary, RScriptExecutor} +import org.broadinstitute.sting.gatk.report.{GATKReportTable, GATKReport} +import org.broadinstitute.sting.utils.exceptions.UserException +import org.apache.commons.io.{FileUtils, IOUtils} + +/** + * Writes out RunInfo to a GATKReport + */ +class QJobsReporter(val disabled: Boolean, val reportFile: File, val pdfFile: Option[File]) extends Logging { + private val JOB_REPORT_QUEUE_SCRIPT = "queueJobReport.R" + + /** + * Write out a job report based on the finished jobs graph + * @param jobGraph + * @param enabledPlotting if true, we will plot the report as well with the JOB_REPORT_QUEUE_SCRIPT + */ + def write(jobGraph: QGraph, enabledPlotting: Boolean) { + if ( ! 
disabled ) { + logger.info("Writing JobLogging GATKReport to file " + reportFile) + printReport(jobGraph.getFunctionsAndStatus, reportFile) + + if ( enabledPlotting ) + pdfFile match { + case Some(file) => + logger.info("Plotting JobLogging GATKReport to file " + file) + plotReport(reportFile, file) + case None => + } + } + } + + private def printReport(jobsRaw: Map[QFunction, JobRunInfo], dest: File) { + val jobs = jobsRaw.filter(_._2.isFilledIn).filter(_._1.includeInReport) + jobs foreach {case (qf, info) => qf.setRunInfo(info)} + val stream = new PrintStream(FileUtils.openOutputStream(dest)) + try { + printJobLogging(jobs.keys.toSeq, stream) + } finally { + IOUtils.closeQuietly(stream) + } + } + + private def plotReport(reportFile: File, pdfFile: File) { + val executor = new RScriptExecutor + executor.addLibrary(RScriptLibrary.GSALIB) + executor.addScript(new Resource(JOB_REPORT_QUEUE_SCRIPT, classOf[QJobReport])) + executor.addArgs(reportFile.getAbsolutePath, pdfFile.getAbsolutePath) + executor.exec() + } + + /** + * Prints the JobLogging logs to a GATKReport. 
First splits up the + * logs by group, and for each group generates a GATKReportTable + */ + private def printJobLogging(logs: Seq[QFunction], stream: PrintStream) { + // create the report + val report: GATKReport = new GATKReport + + // create a table for each group of logs + for ( (group, groupLogs) <- groupLogs(logs) ) { + val keys = logKeys(groupLogs) + report.addTable(group, "Job logs for " + group, keys.size) + val table: GATKReportTable = report.getTable(group) + + // add the columns + keys.foreach(table.addColumn(_)) + for (log <- groupLogs) { + for ( key <- keys ) + table.set(log.getReportName, key, log.getReportFeature(key)) + } + } + + report.print(stream) + } + + private def groupLogs(logs: Seq[QFunction]): Map[String, Seq[QFunction]] = { + logs.groupBy(_.getReportGroup) + } + + private def logKeys(logs: Seq[QFunction]): Set[String] = { + // the keys should be the same for each log, but we will check that + val keys = Set[String](logs(0).getReportFeatureNames : _*) + + for ( log <- logs ) + if ( keys.sameElements(Set(log.getReportFeatureNames)) ) + throw new UserException(("All JobLogging jobs in the same group must have the same set of features. 
" + + "We found one with %s and another with %s").format(keys, log.getReportFeatureNames)) + + keys + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala index 1529d9951..f684e533f 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/QScriptUtils.scala @@ -57,7 +57,8 @@ object QScriptUtils { for (file <- fromFile(in).getLines()) if (!file.startsWith("#") && !file.isEmpty ) list :+= new File(file.trim()) - list.sortWith(_.compareTo(_) < 0) +// list.sortWith(_.compareTo(_) < 0) + list } /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala index 980a22e8e..15101fd75 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ReflectionUtils.scala @@ -159,12 +159,11 @@ object ReflectionUtils { private def getGenericTypes(field: Field): Option[Array[Class[_]]] = { // TODO: Refactor: based on java code in org.broadinstitute.sting.commandline.ArgumentTypeDescriptor // If this is a parameterized collection, find the contained type. If blow up if only one type exists. 
- if (field.getGenericType.isInstanceOf[ParameterizedType]) { + if (hasAnnotation(field, classOf[ClassType])) { + Some(Array(getAnnotation(field, classOf[ClassType]).value)) + } else if (field.getGenericType.isInstanceOf[ParameterizedType]) { val parameterizedType = field.getGenericType.asInstanceOf[ParameterizedType] Some(parameterizedType.getActualTypeArguments.map(_.asInstanceOf[Class[_]])) - } else if (hasAnnotation(field, classOf[ClassType])) { - Some(Array(getAnnotation(field, classOf[ClassType]).value)) - } - else None + } else None } } diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala new file mode 100644 index 000000000..9d94975ba --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/RemoteFile.scala @@ -0,0 +1,14 @@ +package org.broadinstitute.sting.queue.util + +import java.io.File +import org.broadinstitute.sting.utils.io.FileExtension + +/** + * An extension of java.io.File that can be pulled from or pushed to a remote location. 
+ */ +trait RemoteFile extends File with FileExtension { + def pullToLocal() + def pushToRemote() + def deleteRemote() + def remoteDescription: String +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala index 0d8edc25d..54e89ec58 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/ScalaCompoundArgumentTypeDescriptor.scala @@ -28,6 +28,7 @@ import collection.JavaConversions._ import org.broadinstitute.sting.queue.QException import java.lang.Class import org.broadinstitute.sting.commandline.{ArgumentMatches, ArgumentSource, ArgumentTypeDescriptor, ParsingEngine} +import org.broadinstitute.sting.utils.exceptions.UserException import java.lang.reflect.Type /** @@ -75,6 +76,8 @@ class ScalaCompoundArgumentTypeDescriptor extends ArgumentTypeDescriptor { def parse(parsingEngine: ParsingEngine, source: ArgumentSource, classType: Class[_], argumentMatches: ArgumentMatches) = { val componentType = ReflectionUtils.getCollectionType(source.field) + if (componentType == classOf[java.lang.Object]) + throw new UserException.CannotExecuteQScript("Please also include a @ClassType(classOf[]) annotation on field: " + source.field + ". Example: @ClassType(classOf[Double]). 
The scala generic type for the field was subjected to java/scala type erasure and is not available via reflection.") val componentArgumentParser = parsingEngine.selectBestTypeDescriptor(componentType) if (classOf[Seq[_]].isAssignableFrom(classType)) { diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala index 3fb9e0efa..944ef7977 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala @@ -41,7 +41,7 @@ class DataProcessingPipelineTest { " -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf", " -test ", " -p " + projectName).mkString - spec.fileMD5s += testOut -> "60d39ae909fdd049920b54e0965b6d3c" + spec.fileMD5s += testOut -> "45d97df6d291695b92668e8a55c54cd0" PipelineTest.executeTest(spec) } @@ -60,7 +60,7 @@ class DataProcessingPipelineTest { " -bwa /home/unix/carneiro/bin/bwa", " -bwape ", " -p " + projectName).mkString - spec.fileMD5s += testOut -> "61ca3237afdfabf78ee27a5bb80dae59" + spec.fileMD5s += testOut -> "9fca827ecc8436465b831bb6f879357a" PipelineTest.executeTest(spec) } diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala index 74e947377..3e9af3e68 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala @@ -40,7 +40,7 @@ class PacbioProcessingPipelineTest { " -blasr ", " -test ", " -D " + BaseTest.publicTestDir + "exampleDBSNP.vcf").mkString - spec.fileMD5s += testOut -> "61b06e8b78a93e6644657e6d38851084" + spec.fileMD5s += testOut -> "b84f9c45e045685067ded681d5e6224c" 
PipelineTest.executeTest(spec) } } diff --git a/settings/ivysettings.xml b/settings/ivysettings.xml index e17342442..ce7667140 100644 --- a/settings/ivysettings.xml +++ b/settings/ivysettings.xml @@ -7,7 +7,6 @@ -