diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java index c7b990a88..416f66ec6 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseAndQualsCounts.java @@ -69,10 +69,30 @@ public class BaseAndQualsCounts extends BaseCounts { private long sumInsertionQual_N = 0; private long sumDeletionQual_N = 0; + /** + * Increments the count + * + * @param base the base + * @param baseQual the base quality + * @param insQual the insertion quality + * @param delQual the deletion quality + * @param baseMappingQual the mapping quality + * @param isLowQualBase true if the base is low quality + */ + public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) { + // if we already have high quality bases, ignore low quality ones + if ( isLowQualBase && !isLowQuality() ) + return; + + // if this is a high quality base then remove any low quality bases and start from scratch + if ( !isLowQualBase && isLowQuality() ) { + if ( totalCount() > 0 ) + clear(); + setLowQuality(false); + } - public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { final BaseIndex i = BaseIndex.byteToBase(base); - super.incr(i, baseQual); + super.incr(i, baseQual, baseMappingQual); switch (i) { case A: sumInsertionQual_A += insQual; sumDeletionQual_A += delQual; break; case C: sumInsertionQual_C += insQual; sumDeletionQual_C += delQual; break; @@ -84,9 +104,23 @@ public class BaseAndQualsCounts extends BaseCounts { } } - public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) { + /** + * Decrements the count + * + * @param 
base the base + * @param baseQual the base quality + * @param insQual the insertion quality + * @param delQual the deletion quality + * @param baseMappingQual the mapping quality + * @param isLowQualBase true if the base is low quality + */ + public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) { + // if this is not the right type of base, ignore it + if ( isLowQualBase != isLowQuality() ) + return; + final BaseIndex i = BaseIndex.byteToBase(base); - super.decr(i, baseQual); + super.decr(i, baseQual, baseMappingQual); switch (i) { case A: sumInsertionQual_A -= insQual; sumDeletionQual_A -= delQual; break; case C: sumInsertionQual_C -= insQual; sumDeletionQual_C -= delQual; break; @@ -131,4 +165,13 @@ public class BaseAndQualsCounts extends BaseCounts { default: throw new IllegalArgumentException(base.name()); } } + + /** + * Clears out all stored data in this object + */ + public void clear() { + super.clear(); + sumInsertionQual_A = sumInsertionQual_C = sumInsertionQual_G = sumInsertionQual_T = sumInsertionQual_D = sumInsertionQual_I = sumInsertionQual_N = 0; + sumDeletionQual_A = sumDeletionQual_C = sumDeletionQual_G = sumDeletionQual_T = sumDeletionQual_D = sumDeletionQual_I = sumDeletionQual_N = 0; + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java index 17ce3c90d..afcaf1510 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCounts.java @@ -48,6 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; +import it.unimi.dsi.fastutil.ints.IntArrayList; 
+import org.broadinstitute.sting.utils.MathUtils; /** @@ -78,6 +80,8 @@ import com.google.java.contract.Requires; private int count_N = 0; private int sumQual_N = 0; private int totalCount = 0; // keeps track of total count since this is requested so often + private final IntArrayList mappingQualities = new IntArrayList(); // keeps the mapping quality of each read that contributed to this + private boolean isLowQuality = true; // this object represents low quality bases unless we are told otherwise public static BaseCounts createWithCounts(int[] countsACGT) { @@ -100,6 +104,7 @@ import com.google.java.contract.Requires; this.count_I += other.count_I; this.count_N += other.count_N; this.totalCount += other.totalCount; + this.mappingQualities.addAll(other.mappingQualities); } @Requires("other != null") @@ -112,6 +117,7 @@ import com.google.java.contract.Requires; this.count_I -= other.count_I; this.count_N -= other.count_N; this.totalCount -= other.totalCount; + this.mappingQualities.removeAll(other.mappingQualities); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") @@ -120,7 +126,7 @@ import com.google.java.contract.Requires; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1") - public void incr(final BaseIndex base, final byte qual) { + public void incr(final BaseIndex base, final byte qual, final int mappingQuality) { switch (base) { case A: ++count_A; sumQual_A += qual; break; case C: ++count_C; sumQual_C += qual; break; @@ -131,6 +137,7 @@ import com.google.java.contract.Requires; case N: ++count_N; sumQual_N += qual; break; } ++totalCount; + mappingQualities.add(mappingQuality); } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") @@ -152,7 +159,7 @@ import com.google.java.contract.Requires; } @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") - public void decr(final BaseIndex base, final byte qual) { + public 
void decr(final BaseIndex base, final byte qual, final int mappingQuality) { switch (base) { case A: --count_A; sumQual_A -= qual; break; case C: --count_C; sumQual_C -= qual; break; @@ -163,6 +170,7 @@ import com.google.java.contract.Requires; case N: --count_N; sumQual_N -= qual; break; } --totalCount; + mappingQualities.remove((Integer) mappingQuality); } @Ensures("result >= 0") @@ -229,6 +237,15 @@ import com.google.java.contract.Requires; return totalCount; } + /** + * The RMS of the mapping qualities of all reads that contributed to this object + * + * @return the RMS of the mapping qualities of all reads that contributed to this object + */ + public double getRMS() { + return MathUtils.rms(mappingQualities); + } + /** * Given a base , it returns the proportional count of this base compared to all other bases * @@ -325,4 +342,26 @@ import com.google.java.contract.Requires; final int total = totalCountWithoutIndels(); return (total == 0) ? 0.0 : (double)countOfBase(base) / (double)total; } + + /** + * @return true if this instance represents low quality bases + */ + public boolean isLowQuality() { return isLowQuality; } + + /** + * Sets the low quality value + * + * @param value true if this instance represents low quality bases false otherwise + */ + public void setLowQuality(final boolean value) { isLowQuality = value; } + + /** + * Clears out all stored data in this object + */ + public void clear() { + count_A = count_C = count_G = count_T = count_D = count_I = count_N = 0; + sumQual_A = sumQual_C = sumQual_G = sumQual_T = sumQual_D = sumQual_I = sumQual_N = 0; + totalCount = 0; + mappingQualities.clear(); + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java index 3532a74fb..616388e8c 100644 --- 
a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElement.java @@ -46,14 +46,10 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; -import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.objects.ObjectArrayList; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - /** * The element that describes the header of the sliding window. @@ -68,7 +64,6 @@ public class HeaderElement { private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right private int nSoftClippedBases; // How many bases in this site came from soft clipped bases private int location; // Genome location of this site (the sliding window knows which contig we're at - private IntArrayList mappingQuality; // keeps the mapping quality of each read that contributed to this element (site) public int getLocation() { return location; @@ -89,7 +84,7 @@ public class HeaderElement { * @param location the reference location for the new element */ public HeaderElement(final int location) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new IntArrayList()); + this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location); } /** @@ -99,7 +94,7 @@ public class HeaderElement { * @param location the reference location for the new element */ public HeaderElement(final int location, final int insertionsToTheRight) { - this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location, new IntArrayList()); + this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location); } /** @@ -110,16 +105,14 @@ public class HeaderElement { * @param 
insertionsToTheRight number of insertions to the right of this HeaderElement * @param nSoftClippedBases number of softclipped bases of this HeaderElement * @param location the reference location of this reference element - * @param mappingQuality the list of mapping quality values of all reads that contributed to this * HeaderElement */ - public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, IntArrayList mappingQuality) { + public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location) { this.consensusBaseCounts = consensusBaseCounts; this.filteredBaseCounts = filteredBaseCounts; this.insertionsToTheRight = insertionsToTheRight; this.nSoftClippedBases = nSoftClippedBases; this.location = location; - this.mappingQuality = mappingQuality; } /** @@ -128,35 +121,52 @@ public class HeaderElement { * * @return true if site is variant by any definition. False otherwise. 
*/ - public boolean isVariant(double minVariantProportion, double minIndelProportion) { - return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips()); + public boolean isVariant(double minVariantPvalue, double minIndelProportion) { + return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips()); } /** * Adds a new base to the HeaderElement updating all counts accordingly * - * @param base the base to add + * @param base the base to add * @param baseQual the base quality + * @param insQual the base insertion quality + * @param delQual the base deletion quality * @param baseMappingQuality the mapping quality of the read this base belongs to + * @param minBaseQual the minimum base qual allowed to be a good base + * @param minMappingQual the minimum mapping qual allowed to be a good read + * @param isSoftClipped true if the base is soft-clipped in the original read */ public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) { - if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual)) - consensusBaseCounts.incr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts + // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts + if ( baseMappingQuality >= minMappingQual ) + consensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); else - filteredBaseCounts.incr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts + filteredBaseCounts.incr(base, baseQual, 
insQual, delQual, baseMappingQuality, baseQual < minBaseQual); - this.mappingQuality.add(baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site - nSoftClippedBases += isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter + nSoftClippedBases += isSoftClipped ? 1 : 0; } + /** + * Removes a base from the HeaderElement updating all counts accordingly + * + * @param base the base to remove + * @param baseQual the base quality + * @param insQual the base insertion quality + * @param delQual the base deletion quality + * @param baseMappingQuality the mapping quality of the read this base belongs to + * @param minBaseQual the minimum base qual allowed to be a good base + * @param minMappingQual the minimum mapping qual allowed to be a good read + * @param isSoftClipped true if the base is soft-clipped in the original read + */ public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) { - if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual)) - consensusBaseCounts.decr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts + // If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts + if ( baseMappingQuality >= minMappingQual ) + consensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); else - filteredBaseCounts.decr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts + filteredBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual); - this.mappingQuality.remove((Integer) baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site - nSoftClippedBases -= isSoftClipped ? 
1 : 0; // if this base is softclipped, add the counter + nSoftClippedBases -= isSoftClipped ? 1 : 0; } /** * Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions @@ -193,15 +203,6 @@ public class HeaderElement { return (!hasFilteredData() && !hasConsensusData()); } - /** - * The RMS of the mapping qualities of all reads that contributed to this HeaderElement - * - * @return the RMS of the mapping qualities of all reads that contributed to this HeaderElement - */ - public double getRMS() { - return MathUtils.rms(mappingQuality); - } - /** * removes an insertion from this element (if you removed a read that had an insertion) */ @@ -236,7 +237,7 @@ public class HeaderElement { /** * Whether or not the HeaderElement is variant due to excess deletions * - * @return whether or not the HeaderElement is variant due to excess insertions + * @return whether or not the HeaderElement is variant due to excess deletions */ private boolean isVariantFromDeletions(double minIndelProportion) { return consensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || consensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion; @@ -245,12 +246,15 @@ public class HeaderElement { /** * Whether or not the HeaderElement is variant due to excess mismatches * - * @return whether or not the HeaderElement is variant due to excess insertions + * @param minVariantPvalue the minimum pvalue to call a site variant. 
+ * @return whether or not the HeaderElement is variant due to excess mismatches */ - protected boolean isVariantFromMismatches(double minVariantProportion) { - BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels(); - double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon); - return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion); + protected boolean isVariantFromMismatches(double minVariantPvalue) { + final int totalCount = consensusBaseCounts.totalCountWithoutIndels(); + final BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels(); + final int countOfOtherBases = totalCount - consensusBaseCounts.countOfBase(mostCommon); + final double pvalue = countOfOtherBases == 0 ? 0.0 : MathUtils.binomialCumulativeProbability(0, countOfOtherBases+1, totalCount, 0.5); + return pvalue > minVariantPvalue; } /** @@ -263,42 +267,44 @@ public class HeaderElement { return nSoftClippedBases > 0 && nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases); } - protected boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) { - return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual; - } - /** * Calculates the number of alleles necessary to represent this site. * - * @param minVariantProportion the minimum proportion to call a site variant. - * @param allowDeletions should we allow deletions? - * @return the number of alleles necessary to represent this site or -1 if allowDeletions is false and there are a sufficient number of them + * @param minVariantPvalue the minimum pvalue to call a site variant. 
+ * @return the number of alleles necessary to represent this site or -1 if there are too many indels */ - public int getNumberOfAlleles(final double minVariantProportion, final boolean allowDeletions) { - final List alleles = getAlleles(minVariantProportion, allowDeletions); + public int getNumberOfBaseAlleles(final double minVariantPvalue) { + final ObjectArrayList alleles = getAlleles(minVariantPvalue); return alleles == null ? -1 : alleles.size(); } /** * Calculates the alleles necessary to represent this site. * - * @param minVariantProportion the minimum proportion to call a site variant. - * @param allowDeletions should we allow deletions? - * @return the list of alleles necessary to represent this site or null if allowDeletions is false and there are a sufficient number of them + * @param minVariantPvalue the minimum pvalue to call a site variant. + * @return the list of alleles necessary to represent this site or null if there are too many indels */ - public List getAlleles(final double minVariantProportion, final boolean allowDeletions) { + public ObjectArrayList getAlleles(final double minVariantPvalue) { + // make sure we have bases at all final int totalBaseCount = consensusBaseCounts.totalCount(); if ( totalBaseCount == 0 ) - return Collections.emptyList(); + return new ObjectArrayList(0); - final int minBaseCountForRelevantAlleles = Math.max(1, (int)(minVariantProportion * totalBaseCount)); + // next, check for insertions + if ( hasSignificantCount(insertionsToTheRight, minVariantPvalue) ) + return null; - final List alleles = new ArrayList(4); + // finally, check for the bases themselves (including deletions) + final ObjectArrayList alleles = new ObjectArrayList(4); for ( final BaseIndex base : BaseIndex.values() ) { final int baseCount = consensusBaseCounts.countOfBase(base); + if ( baseCount == 0 ) + continue; - if ( baseCount >= minBaseCountForRelevantAlleles ) { - if ( !allowDeletions && base == BaseIndex.D ) + final double pvalue = 
MathUtils.binomialCumulativeProbability(0, baseCount+1, totalBaseCount, 0.5); + + if ( pvalue > minVariantPvalue ) { + if ( base == BaseIndex.D ) return null; alleles.add(base); } @@ -309,15 +315,26 @@ public class HeaderElement { /* * Checks whether there are a significant number of softclips. * - * @param minVariantProportion the minimum proportion to consider something significant. + * @param minVariantPvalue the minimum pvalue to call a site variant. * @return true if there are significant softclips, false otherwise */ - public boolean hasSignificantSoftclips(final double minVariantProportion) { + public boolean hasSignificantSoftclips(final double minVariantPvalue) { + return hasSignificantCount(nSoftClippedBases, minVariantPvalue); + } + + /** + * Checks whether the given count is significant. + * + * @param count the count to test against + * @param minVariantPvalue the minimum pvalue to call a site variant. + * @return true if there is a significant count given the provided pvalue, false otherwise + */ + private boolean hasSignificantCount(final int count, final double minVariantPvalue) { final int totalBaseCount = consensusBaseCounts.totalCount(); - if ( totalBaseCount == 0 ) + if ( count == 0 || totalBaseCount == 0 ) return false; - final int minBaseCountForSignificance = Math.max(1, (int)(minVariantProportion * totalBaseCount)); - return nSoftClippedBases >= minBaseCountForSignificance; + final double pvalue = MathUtils.binomialCumulativeProbability(0, count+1, totalBaseCount, 0.5); + return pvalue > minVariantPvalue; } } \ No newline at end of file diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java index 42873964d..85aee9fc9 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java +++ 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/MultiSampleCompressor.java @@ -96,14 +96,14 @@ public class MultiSampleCompressor { final int contextSize, final int downsampleCoverage, final int minMappingQuality, - final double minAltProportionToTriggerVariant, + final double minAltPValueToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy) { for ( String name : SampleUtils.getSAMFileSamples(header) ) { compressorsPerSample.put(name, new SingleSampleCompressor(contextSize, downsampleCoverage, - minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); + minMappingQuality, minAltPValueToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index c9730e95a..4d90a83be 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -64,6 +64,7 @@ import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -139,19 +140,21 @@ public class ReduceReads extends ReadWalker, Redu * towards variable regions. 
*/ @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false) - public byte minBaseQual = 20; + public byte minBaseQual = 15; /** - * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality - * lower than this threshold will be hard clipped off before entering the reduce reads algorithm. + * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases at the tails with + * quality at or lower than this threshold will be hard clipped off before entering the reduce reads algorithm. */ @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false) public byte minTailQuality = 2; /** - * Any number of VCF files representing known SNPs to be used for the experimental polyploid-based reduction. + * Any number of VCF files representing known SNPs to be used for the polyploid-based reduction. * Could be e.g. dbSNP and/or official 1000 Genomes SNP calls. Non-SNP variants in these files will be ignored. - * Note that polyploid ("het") compression will work only when a single SNP is present in a consensus window. + * If provided, the polyploid ("het") compression will work only when a single SNP from the known set is present + * in a consensus window (otherwise there will be no reduction); if not provided then polyploid compression will + * be triggered anywhere there is a single SNP present in a consensus window. */ @Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false) public List> known = Collections.emptyList(); @@ -204,9 +207,18 @@ public class ReduceReads extends ReadWalker, Redu * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be * considered consensus. 
*/ + @Deprecated @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false) public double minAltProportionToTriggerVariant = 0.05; + /** + * Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region. + * Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to trigger polyploid compression). + */ + @Advanced + @Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "", required = false) + public double minAltPValueToTriggerVariant = 0.01; + /** * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be * considered consensus. @@ -253,7 +265,7 @@ public class ReduceReads extends ReadWalker, Redu ObjectSortedSet intervalList; - final ObjectSortedSet knownSnpPositions = new ObjectAVLTreeSet(); + ObjectSortedSet knownSnpPositions; // IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag @@ -273,6 +285,14 @@ public class ReduceReads extends ReadWalker, Redu if ( nwayout && out != null ) throw new UserException.CommandLineException("--out and --nwayout can not be used simultaneously; please use one or the other"); + if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 ) + throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)"); + + if ( known.isEmpty() ) + knownSnpPositions = null; + else + knownSnpPositions = new ObjectAVLTreeSet(); + GenomeAnalysisEngine toolkit = getToolkit(); readNameHash = new Object2LongOpenHashMap(100000); // prepare the read name hash to keep track of what reads have had their read names compressed intervalList = new 
ObjectAVLTreeSet(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode @@ -392,7 +412,7 @@ public class ReduceReads extends ReadWalker, Redu */ @Override public ReduceReadsStash reduceInit() { - return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); + return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltPValueToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); } /** @@ -470,8 +490,8 @@ public class ReduceReads extends ReadWalker, Redu * @param read the current read, used for checking whether there are stale positions we can remove */ protected void clearStaleKnownPositions(final GATKSAMRecord read) { - // nothing to clear if empty - if ( knownSnpPositions.isEmpty() ) + // nothing to clear if not used or empty + if ( knownSnpPositions == null || knownSnpPositions.isEmpty() ) return; // not ready to be cleared until we encounter a read from a different contig diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java index db1e0baaf..ec041386c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SingleSampleCompressor.java @@ -62,7 +62,7 @@ public class SingleSampleCompressor { final private int contextSize; final private int downsampleCoverage; final private int minMappingQuality; - final private double minAltProportionToTriggerVariant; + final private double 
minAltPValueToTriggerVariant; final private double minIndelProportionToTriggerVariant; final private int minBaseQual; final private ReduceReads.DownsampleStrategy downsampleStrategy; @@ -75,7 +75,7 @@ public class SingleSampleCompressor { public SingleSampleCompressor(final int contextSize, final int downsampleCoverage, final int minMappingQuality, - final double minAltProportionToTriggerVariant, + final double minAltPValueToTriggerVariant, final double minIndelProportionToTriggerVariant, final int minBaseQual, final ReduceReads.DownsampleStrategy downsampleStrategy) { @@ -83,7 +83,7 @@ public class SingleSampleCompressor { this.downsampleCoverage = downsampleCoverage; this.minMappingQuality = minMappingQuality; this.slidingWindowCounter = 0; - this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant; + this.minAltPValueToTriggerVariant = minAltPValueToTriggerVariant; this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant; this.minBaseQual = minBaseQual; this.downsampleStrategy = downsampleStrategy; @@ -114,7 +114,7 @@ public class SingleSampleCompressor { } if ( slidingWindow == null) { // this is the first read - slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); + slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltPValueToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); slidingWindowCounter++; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java 
b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 8a80c5570..5fd7724cb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -97,9 +97,8 @@ public class SlidingWindow { protected int filteredDataConsensusCounter; protected String filteredDataReadName; - // Additional parameters - protected double MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches + protected double MIN_ALT_PVALUE_TO_TRIGGER_VARIANT; // pvalue has to be greater than this value to trigger variant region due to mismatches protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions protected int MIN_BASE_QUAL_TO_COUNT; // qual has to be greater than or equal to this value protected int MIN_MAPPING_QUALITY; @@ -150,11 +149,15 @@ public class SlidingWindow { this.readsInWindow = new ObjectAVLTreeSet(); } - public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) { + public SlidingWindow(final String contig, final int contigIndex, final int contextSize, final SAMFileHeader samHeader, + final GATKSAMReadGroupRecord readGroupAttribute, final int windowNumber, + final double minAltPValueToTriggerVariant, final double minIndelProportionToTriggerVariant, + final int minBaseQual, final int minMappingQuality, final int downsampleCoverage, + final ReduceReads.DownsampleStrategy 
downsampleStrategy, final boolean hasIndelQualities) { this.contextSize = contextSize; this.downsampleCoverage = downsampleCoverage; - this.MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant; + this.MIN_ALT_PVALUE_TO_TRIGGER_VARIANT = minAltPValueToTriggerVariant; this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant; this.MIN_BASE_QUAL_TO_COUNT = minBaseQual; this.MIN_MAPPING_QUALITY = minMappingQuality; @@ -341,8 +344,14 @@ public class SlidingWindow { private final MarkedSites markedSites = new MarkedSites(); /** - * returns an array marked with variant and non-variant regions (it uses - * markVariantRegions to make the marks) + * returns the MarkedSites object so that it can be tested after adding data to the Sliding Window + * + * @return the Marked Sites object used by this Sliding Window + */ + protected MarkedSites getMarkedSitesForTesting() { return markedSites; } + + /** + * returns an array marked with variant and non-variant regions (it uses markVariantRegion to make the marks) * * @param stop check the window from start to stop (not-inclusive) */ @@ -368,8 +377,8 @@ public class SlidingWindow { if (headerElementIterator.hasNext()) { HeaderElement headerElement = headerElementIterator.next(); - if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) - markVariantRegion(markedSites, i - windowHeaderStartLocation); + if (headerElement.isVariant(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) + markVariantRegion(i - windowHeaderStartLocation); } else break; @@ -379,14 +388,24 @@ public class SlidingWindow { /** * Marks the sites around the variant site (as true) * - * @param markedSites the boolean array to bear the marks * @param variantSiteLocation the location where a variant site was found */ - protected void markVariantRegion(final MarkedSites markedSites, final int variantSiteLocation) { + 
protected void markVariantRegion(final int variantSiteLocation) { int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize; - int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length : variantSiteLocation + contextSize + 1; - for (int i = from; i < to; i++) - markedSites.getVariantSiteBitSet()[i] = true; + int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length - 1 : variantSiteLocation + contextSize; + markRegionAs(from, to, true); + } + + /** + * Marks the sites around the variant site (as true) + * + * @param from the start index (inclusive) to mark + * @param to the end index (inclusive) to mark + * @param isVariant mark the region with this boolean value + */ + private void markRegionAs(final int from, final int to, final boolean isVariant) { + for (int i = from; i <= to; i++) + markedSites.getVariantSiteBitSet()[i] = isVariant; } /** @@ -580,7 +599,7 @@ public class SlidingWindow { filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, strandType); } - genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS()); + genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts()); } return result; @@ -611,7 +630,7 @@ public class SlidingWindow { if (!headerElement.hasConsensusData()) throw new ReviewedStingException("No CONSENSUS data in " + index); - genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts(), headerElement.getRMS()); + genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts()); } } @@ -620,15 +639,14 @@ public class SlidingWindow { * * @param syntheticRead the synthetic read to add to * @param 
baseCounts the base counts object in the header element - * @param rms the rms mapping quality in the header element */ - private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) { + private void genericAddBaseToConsensus(final SyntheticRead syntheticRead, final BaseAndQualsCounts baseCounts) { final BaseIndex base = baseCounts.baseIndexWithMostProbability(); byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE); byte qual = baseCounts.averageQualsOfBase(base); byte insQual = baseCounts.averageInsertionQualsOfBase(base); byte delQual = baseCounts.averageDeletionQualsOfBase(base); - syntheticRead.add(base, count, qual, insQual, delQual, rms); + syntheticRead.add(base, count, qual, insQual, delQual, baseCounts.getRMS()); } /** @@ -636,39 +654,30 @@ public class SlidingWindow { * * @param start the first window header index in the variant region (inclusive) * @param stop the last window header index of the variant region (inclusive) - * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here - * @return a non-null list of all reads contained in the variant region + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) + * @return a non-null object representing all reads contained in the variant region */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) @Ensures("result != null") - protected ObjectList compressVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { - ObjectList allReads = new ObjectArrayList(); + protected CloseVariantRegionResult compressVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { + final CloseVariantRegionResult allReads = new CloseVariantRegionResult(stop); // Try to compress into a polyploid consensus - // 
Optimization: don't bother if there are no known SNPs - final int hetRefPosition = knownSnpPositions.isEmpty() ? -1 : findSinglePolyploidCompressiblePosition(start, stop); - - boolean successfullyCreatedPolyploidConsensus = false; + // Optimization: don't bother if there are no known SNPs here + final int hetRefPosition = (knownSnpPositions != null && knownSnpPositions.isEmpty()) ? -1 : findSinglePolyploidCompressiblePosition(start, stop); // Note that using the hetRefPosition protects us from trying to compress variant regions that are created by // insertions (which we don't want because we can't confirm that they represent the same allele). - // Also, we only allow polyploid consensus creation at known sites. + // Also, we only allow polyploid consensus creation at known sites if provided. if ( hetRefPosition != -1 && matchesKnownPosition(windowHeader.get(hetRefPosition).getLocation(), knownSnpPositions) ) { - // try to create the polyploid consensus - final ObjectList polyploidReads = createPolyploidConsensus(start, stop, hetRefPosition); - - // if successful we are good to go! 
- if ( polyploidReads != null ) { - allReads.addAll(polyploidReads); - successfullyCreatedPolyploidConsensus = true; - } + allReads.reads.addAll(createPolyploidConsensus(hetRefPosition)); + allReads.stopPerformed = hetRefPosition; // we stopped at the het position } - // if we can't create a polyploid consensus here, return all reads that overlap the variant region and remove them // from the window header entirely; also remove all reads preceding the variant region (since they will be output // as consensus right after compression) - if ( !successfullyCreatedPolyploidConsensus ) { + else { final int refStart = windowHeader.get(start).getLocation(); final int refStop = windowHeader.get(stop).getLocation(); @@ -676,7 +685,7 @@ public class SlidingWindow { for ( final GATKSAMRecord read : readsInWindow ) { if ( read.getSoftStart() <= refStop ) { if ( read.getAlignmentEnd() >= refStart ) { - allReads.add(read); + allReads.reads.add(read); removeFromHeader(windowHeader, read); } toRemove.add(read); @@ -687,6 +696,7 @@ public class SlidingWindow { for ( final GATKSAMRecord read : toRemove ) readsInWindow.remove(read); } + return allReads; } @@ -694,13 +704,13 @@ public class SlidingWindow { * Determines whether the given position match one of the known sites * * @param targetPosition the position of the het site - * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) * @return true if the targetPosition matches a known SNP position, false otherwise */ @Requires({"targetPosition >= 1 && knownSnpPositions != null"}) protected boolean matchesKnownPosition(final int targetPosition, final ObjectSortedSet knownSnpPositions) { final GenomeLoc targetLoc = new UnvalidatingGenomeLoc(contig, contigIndex, targetPosition, targetPosition); - return 
knownSnpPositions.contains(targetLoc); + return knownSnpPositions == null || knownSnpPositions.contains(targetLoc); } /* @@ -716,7 +726,7 @@ public class SlidingWindow { for ( int i = start; i <= stop; i++ ) { - final int nAlleles = windowHeader.get(i).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false); + final int nAlleles = windowHeader.get(i).getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT); // we will only work on diploid non-indel cases because we just don't want to handle/test other scenarios if ( nAlleles > 2 || nAlleles == -1 ) @@ -736,21 +746,22 @@ public class SlidingWindow { } /* - * Checks whether there's a position in the header with a significant number of softclips. + * Checks whether there's a position in the header with a significant number of softclips or a variant. * * @param header the window header to examine * @param positionToSkip the global position to skip in the examination (use negative number if you don't want to make use of this argument) * @return true if there exists a position with significant softclips, false otherwise */ @Requires("header != null") - protected boolean hasSignificantSoftclipPosition(final List header, final int positionToSkip) { + protected boolean hasPositionWithSignificantSoftclipsOrVariant(final List header, final int positionToSkip) { for ( final HeaderElement headerElement : header ) { if ( headerElement.getLocation() == positionToSkip ) continue; - if ( headerElement.hasSignificantSoftclips(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT) ) + if ( headerElement.hasSignificantSoftclips(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) || + headerElement.getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) > 1 ) return true; } @@ -762,29 +773,45 @@ public class SlidingWindow { * * @param start the first window header index in the variant region (inclusive) * @param stop the last window header index of the variant region (inclusive) - * @param knownSnpPositions the set of known SNPs used to determine 
whether to allow polyploid consensus creation here - * @return a non-null list of all reads contained in the variant region plus any adjacent synthetic reads + * @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere) + * @return a non-null object representing all reads contained in the variant region plus any adjacent synthetic reads */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) @Ensures("result != null") - protected ObjectList closeVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { - ObjectList allReads = compressVariantRegion(start, stop, knownSnpPositions); + protected CloseVariantRegionResult closeVariantRegion(final int start, final int stop, final ObjectSortedSet knownSnpPositions) { + final CloseVariantRegionResult allReads = compressVariantRegion(start, stop, knownSnpPositions); - ObjectList result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads; - result.addAll(addToSyntheticReads(windowHeader, 0, stop+1, SyntheticRead.StrandType.STRANDLESS)); - result.addAll(finalizeAndAdd(ConsensusType.BOTH)); + final CloseVariantRegionResult result = new CloseVariantRegionResult(allReads.stopPerformed); + result.reads.addAll(downsampleCoverage > 0 ? 
downsampleVariantRegion(allReads.reads) : allReads.reads); + result.reads.addAll(addToSyntheticReads(windowHeader, 0, allReads.stopPerformed + 1, SyntheticRead.StrandType.STRANDLESS)); + result.reads.addAll(finalizeAndAdd(ConsensusType.BOTH)); return result; // finalized reads will be downsampled if necessary } + /* + * @see #closeVariantRegions(CompressionStash, ObjectSortedSet, boolean) with forceCloseFullRegions set to false + */ + public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { + return closeVariantRegions(regions, knownSnpPositions, false); + } + + private static final class CloseVariantRegionResult { + final private ObjectList reads = new ObjectArrayList(); + private int stopPerformed; + + public CloseVariantRegionResult(final int stopPerformed) { this.stopPerformed = stopPerformed; } + } + /* * Finalizes the list of regions requested (and any regions preceding them) * * @param regions the list of regions to finalize - * @param knownSnpPositions the set of known SNP positions + * @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere) + * @param forceCloseFullRegions if true, requires this method to make sure all regions are fully closed; otherwise, we may decide not to close up to the very end (e.g. 
during het compression) * @return a non-null set of reduced reads representing the finalized regions */ - public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions) { + public ObjectSet closeVariantRegions(final CompressionStash regions, final ObjectSortedSet knownSnpPositions, final boolean forceCloseFullRegions) { final ObjectAVLTreeSet allReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); if ( !regions.isEmpty() ) { @@ -794,9 +821,33 @@ public class SlidingWindow { for ( final GenomeLoc region : regions ) { if (((FinishedGenomeLoc)region).isFinished() && region.getContig().equals(contig) && region.getStart() >= windowHeaderStart && region.getStop() < windowHeaderStart + windowHeader.size()) { final int start = region.getStart() - windowHeaderStart; - final int stop = region.getStop() - windowHeaderStart; + int stop = region.getStop() - windowHeaderStart; - allReads.addAll(closeVariantRegion(start, stop, knownSnpPositions)); + CloseVariantRegionResult closeVariantRegionResult = closeVariantRegion(start, stop, knownSnpPositions); + allReads.addAll(closeVariantRegionResult.reads); + + // check whether we didn't close the whole region that was requested + if ( stop > 0 && closeVariantRegionResult.stopPerformed < stop ) { + // we should update the variant sites bitset because the context size's worth of bases after the variant position are no longer "variant" + markRegionAs(closeVariantRegionResult.stopPerformed + 1, stop, false); + + // if the calling method said that it didn't care then we are okay so update the stop + if ( !forceCloseFullRegions ) { + stop = closeVariantRegionResult.stopPerformed; + } + // otherwise, we need to forcibly push the stop that we originally requested + else { + while ( closeVariantRegionResult.stopPerformed < stop ) { + // first clean up used header elements so they don't get reused + for ( int i = 0; i <= closeVariantRegionResult.stopPerformed; i++ ) + 
windowHeader.remove(); + stop -= (closeVariantRegionResult.stopPerformed + 1); + + closeVariantRegionResult = closeVariantRegion(0, stop, knownSnpPositions); + allReads.addAll(closeVariantRegionResult.reads); + } + } + } // We need to clean up the window header elements up until the end of the requested region so that they don't get used for future regions. // Note that this cleanup used to happen outside the above for-loop, but that was causing an occasional doubling of the reduced reads @@ -847,7 +898,7 @@ public class SlidingWindow { * regions that still exist regardless of being able to fulfill the * context size requirement in the end. * - * @param knownSnpPositions the set of known SNP positions + * @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere) * @return A non-null set/list of all reads generated */ @Ensures("result != null") @@ -855,12 +906,11 @@ public class SlidingWindow { // mark variant regions ObjectSet finalizedReads = new ObjectAVLTreeSet(new AlignmentStartWithNoTiesComparator()); CompressionStash regions = new CompressionStash(); - boolean forceCloseUnfinishedRegions = true; if (!windowHeader.isEmpty()) { markSites(getStopLocation(windowHeader) + 1); - regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions); - finalizedReads = closeVariantRegions(regions, knownSnpPositions); + regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), true); + finalizedReads = closeVariantRegions(regions, knownSnpPositions, true); if (!windowHeader.isEmpty()) { finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), SyntheticRead.StrandType.STRANDLESS)); @@ -908,117 +958,105 @@ public class SlidingWindow { } // define this so that we can use Java generics below - private static class HeaderElementList extends LinkedList {} + private final static class HeaderElementList extends LinkedList {} 
+ + private final static class SingleStrandConsensusData { + final HeaderElementList consensus = new HeaderElementList(); + final ObjectList reads = new ObjectArrayList(); + } /** - * Finalizes a variant region for point mutations, and any adjacent synthetic reads. Indel sites are not supported. + * Finalizes a variant region - and any adjacent synthetic reads - for point mutations (indel sites are not + * supported) with polyploid compression. * - * @param start the first window header index of the variant region (inclusive) - * @param stop the last window header index of the variant region (inclusive) * @param hetRefPosition window header index of the het site; MUST NOT BE AN INDEL SITE! - * @return a list of all reads contained in the variant region as a polyploid consensus, or null if not possible + * @return a non-null list of all reads contained in the variant region as a polyploid consensus */ @Requires({"start >= 0 && (stop >= start || stop == 0)"}) - protected ObjectList createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) { + @Ensures({"result != null"}) + protected ObjectList createPolyploidConsensus(final int hetRefPosition) { // we will create two (positive strand, negative strand) headers for each haplotype - final HeaderElementList[] headersPosStrand = new HeaderElementList[2]; - final HeaderElementList[] headersNegStrand = new HeaderElementList[2]; + final SingleStrandConsensusData[] headersPosStrand = new SingleStrandConsensusData[2]; + final SingleStrandConsensusData[] headersNegStrand = new SingleStrandConsensusData[2]; - final int refStart = windowHeader.get(start).getLocation(); - final int refStop = windowHeader.get(stop).getLocation(); final int globalHetRefPosition = windowHeader.get(hetRefPosition).getLocation(); // initialize the mapping from base (allele) to header final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2); - for ( final BaseIndex allele : 
windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false) ) { + for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) ) { final int currentIndex = alleleHeaderMap.size(); if ( currentIndex > 1 ) throw new IllegalStateException("There are more than 2 alleles present when creating a diploid consensus"); alleleHeaderMap.put(allele.b, currentIndex); - headersPosStrand[currentIndex] = new HeaderElementList(); - headersNegStrand[currentIndex] = new HeaderElementList(); + headersPosStrand[currentIndex] = new SingleStrandConsensusData(); + headersNegStrand[currentIndex] = new SingleStrandConsensusData(); } // sanity check that we saw 2 alleles if ( alleleHeaderMap.size() != 2 ) throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + alleleHeaderMap.size()); - final ObjectList toRemoveFromReadCache = new ObjectArrayList(); - final ObjectList toRemoveFromHeader = new ObjectArrayList(); + final ObjectList readsToRemoveFromHeader = new ObjectArrayList(); for ( final GATKSAMRecord read : readsInWindow ) { - // if the read falls after the region, just skip it for now (we'll get to it later) - if ( read.getSoftStart() > refStop ) + // if the read falls after the het position, just skip it for now (we'll get to it later) + if ( read.getSoftStart() > globalHetRefPosition ) continue; - // if the read falls before the region, remove it - if ( read.getSoftEnd() < refStart ) { - toRemoveFromReadCache.add(read); - continue; - } - - // check whether the read spans the het site - if ( read.getSoftStart() <= globalHetRefPosition && read.getSoftEnd() >= globalHetRefPosition ) { - - // make sure it meets the minimum mapping quality requirement (if not, we'll remove it and not use it for the consensuses) - if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) { - - // where on the read is the het position? 
- final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); - - // this is safe because indels are not supported - final byte base = read.getReadBases()[readPosOfHet]; - final byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPosOfHet]; - - // make sure that the base passes filters (if not, we'll remove it and not use it for the consensuses) - if ( qual >= MIN_BASE_QUAL_TO_COUNT ) { - - // check which allele this read represents - final Integer allele = alleleHeaderMap.get(base); - - // ignore the read if it represents a base that's not part of the consensus - if ( allele != null ) { - // add to the appropriate polyploid header - final LinkedList header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele]; - addToHeader(header, read); - } - } - } - - // remove from the standard header so that we don't double count it - toRemoveFromHeader.add(read); - } - - // we remove all reads falling inside the variant region from the window - toRemoveFromReadCache.add(read); - } - - // sanity check that no new "variant region" exists on just a single consensus strand - // due to softclips now that we've broken everything out into their component parts - for ( final LinkedList header : headersPosStrand ) { - if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) ) - return null; - } - for ( final LinkedList header : headersNegStrand ) { - if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) ) - return null; - } - - // create the polyploid synthetic reads - final ObjectList hetReads = new ObjectArrayList(); - for ( final LinkedList header : headersPosStrand ) - finalizeHetConsensus(header, false, hetReads); - for ( final LinkedList header : headersNegStrand ) - finalizeHetConsensus(header, true, hetReads); - - // remove all used reads - for ( final GATKSAMRecord read : toRemoveFromReadCache ) + // remove all other reads from 
the read cache since we're going to use them here readsInWindow.remove(read); - for ( final GATKSAMRecord read : toRemoveFromHeader ) + + // if the read falls before the het position, we don't need to look at it + if ( read.getSoftEnd() < globalHetRefPosition ) + continue; + + // remove all spanning reads from the consensus header since we're going to incorporate them into a consensus here instead removeFromHeader(windowHeader, read); + // make sure it meets the minimum mapping quality requirement (if not, we won't use it for the consensus) + if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) { + + // where on the read is the het position? + final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL); + + // this is safe because indels are not supported + final byte base = read.getReadBases()[readPosOfHet]; + final byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPosOfHet]; + + // check which allele this read represents + final Integer allele = alleleHeaderMap.get(base); + + // ignore the read if it represents a base that's not part of the consensus + if ( allele != null ) { + // add to the appropriate polyploid header + final SingleStrandConsensusData header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele]; + header.reads.add(read); + addToHeader(header.consensus, read); + } + } + } + + // create the polyploid synthetic reads if we can + final ObjectList hetReads = new ObjectArrayList(); + + // sanity check that no new "variant region" exists on just a single consensus strand due to softclips + // or multi-allelic sites now that we've broken everything out into their component parts. if one does + // exist then we need to back out the consensus for that strand only. 
+ for ( final SingleStrandConsensusData header : headersPosStrand ) { + if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) ) + hetReads.addAll(header.reads); + else + finalizeHetConsensus(header.consensus, false, hetReads); + } + for ( final SingleStrandConsensusData header : headersNegStrand ) { + if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) ) + hetReads.addAll(header.reads); + else + finalizeHetConsensus(header.consensus, true, hetReads); + } + return hetReads; } @@ -1058,7 +1096,7 @@ public class SlidingWindow { int locationIndex = headerStart < 0 ? 0 : readStart - headerStart; if ( removeRead && locationIndex < 0 ) - throw new ReviewedStingException("Provided read is behind the Sliding Window! Read = " + read + ", readStart = " + readStart + ", cigar = " + read.getCigarString() + ", window = " + headerStart + "-" + getStopLocation(header)); + throw new IllegalStateException("Provided read is behind the Sliding Window! Read = " + read + ", readStart = " + readStart + ", cigar = " + read.getCigarString() + ", window = " + headerStart + "-" + getStopLocation(header)); // we only need to create new header elements if we are adding the read, not when we're removing it if ( !removeRead ) @@ -1138,6 +1176,8 @@ public class SlidingWindow { case H: break; case I: + readBaseIndex += cigarElement.getLength(); + // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it. 
if ( removeRead && locationIndex == 0 ) break; @@ -1150,7 +1190,6 @@ public class SlidingWindow { else headerElement.addInsertionToTheRight(); - readBaseIndex += cigarElement.getLength(); break; case D: // deletions are added to the baseCounts with the read mapping quality as it's quality score diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java index 597077742..4e5652c45 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/qc/AssessReducedQuals.java @@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker; import org.broadinstitute.sting.gatk.walkers.TreeReducible; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -99,7 +100,7 @@ public class AssessReducedQuals extends LocusWalker implem public int sufficientQualSum = 600; @Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > (epsilon * Quals_original_bam) we output this interval", required = false) - public double qual_epsilon = 0.25; + public double qual_epsilon = 0.10; @Output protected PrintStream out; @@ -145,7 +146,7 @@ public class AssessReducedQuals extends LocusWalker implem } private boolean isGoodRead(final PileupElement p) { - return !p.isDeletion() && (int)p.getQual() >= 20 && p.getMappingQual() >= 20; + return !p.isDeletion() && (int)p.getQual() >= 15 && p.getMappingQual() >= 20; } private int getTagIndex(final List tags) { diff --git 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java index 7f41836fa..5ae6e86df 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/BaseCountsUnitTest.java @@ -179,7 +179,7 @@ public class BaseCountsUnitTest extends BaseTest { BaseCounts counts = new BaseCounts(); for ( int qual : test.quals ) - counts.incr(BaseIndex.A, (byte)qual); + counts.incr(BaseIndex.A, (byte)qual, 20); final int actualSum = (int)counts.getSumQuals((byte)'A'); final int expectedSum = qualSum(test.quals); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java index 2f744e914..435c1029c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/HeaderElementUnitTest.java @@ -48,12 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; public class HeaderElementUnitTest extends BaseTest { @@ -119,16 +119,15 @@ public class HeaderElementUnitTest extends BaseTest { Assert.assertFalse(headerElement.hasFilteredData()); Assert.assertFalse(headerElement.hasInsertionToTheRight()); Assert.assertTrue(headerElement.isEmpty()); - Assert.assertEquals(headerElement.getRMS(), 
0.0); } private void testHeaderData(final HeaderElement headerElement, final HETest test) { - Assert.assertEquals(headerElement.getRMS(), (double)test.MQ); Assert.assertEquals(headerElement.isVariantFromSoftClips(), test.isClip); Assert.assertFalse(headerElement.isEmpty()); Assert.assertFalse(headerElement.hasInsertionToTheRight()); - Assert.assertEquals(headerElement.hasConsensusData(), headerElement.basePassesFilters(test.baseQual, minBaseQual, test.MQ, minMappingQual)); - Assert.assertEquals(headerElement.hasFilteredData(), !headerElement.basePassesFilters(test.baseQual, minBaseQual, test.MQ, minMappingQual)); + Assert.assertEquals(headerElement.hasConsensusData(), test.MQ >= minMappingQual); + Assert.assertEquals(headerElement.hasFilteredData(), test.MQ < minMappingQual); + Assert.assertEquals(headerElement.hasConsensusData() ? headerElement.getConsensusBaseCounts().getRMS() : headerElement.getFilteredBaseCounts().getRMS(), (double)test.MQ); Assert.assertFalse(headerElement.isVariantFromMismatches(0.05)); Assert.assertEquals(headerElement.isVariant(0.05, 0.05), test.isClip); } @@ -136,13 +135,11 @@ public class HeaderElementUnitTest extends BaseTest { private class AllelesTest { public final int[] counts; - public final double proportion; - public final boolean allowDeletions; + public final double pvalue; - private AllelesTest(final int[] counts, final double proportion, final boolean allowDeletions) { + private AllelesTest(final int[] counts, final double pvalue) { this.counts = counts; - this.proportion = proportion; - this.allowDeletions = allowDeletions; + this.pvalue = pvalue; } } @@ -151,17 +148,15 @@ public class HeaderElementUnitTest extends BaseTest { List tests = new ArrayList(); final int[] counts = new int[]{ 0, 5, 10, 15, 20 }; - final double [] proportions = new double[]{ 0.0, 0.05, 0.10, 0.50, 1.0 }; + final double [] pvalues = new double[]{ 0.0, 0.01, 0.05, 0.20, 1.0 }; for ( final int countA : counts ) { for ( final int countC : counts ) { for 
( final int countG : counts ) { for ( final int countT : counts ) { for ( final int countD : counts ) { - for ( final double proportion : proportions ) { - for ( final boolean allowDeletions : Arrays.asList(true, false) ) { - tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, proportion, allowDeletions)}); - } + for ( final double pvalue : pvalues ) { + tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, pvalue)}); } } } @@ -182,28 +177,33 @@ public class HeaderElementUnitTest extends BaseTest { headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false); } - final int nAllelesSeen = headerElement.getNumberOfAlleles(test.proportion, test.allowDeletions); - final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.proportion, test.allowDeletions); + final int nAllelesSeen = headerElement.getNumberOfBaseAlleles(test.pvalue); + final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.pvalue); Assert.assertEquals(nAllelesSeen, nAllelesExpected); } - private static int calculateExpectedAlleles(final int[] counts, final double proportion, final boolean allowDeletions) { - double total = 0.0; + private static int calculateExpectedAlleles(final int[] counts, final double targetPvalue) { + int total = 0; for ( final int count : counts ) { total += count; } - final int minCount = Math.max(1, (int)(proportion * total)); - - if ( !allowDeletions && counts[BaseIndex.D.index] >= minCount ) - return -1; - int result = 0; - for ( final int count : counts ) { - if ( count > 0 && count >= minCount ) + for ( int index = 0; index < counts.length; index++ ) { + final int count = counts[index]; + if ( count == 0 ) + continue; + + final double pvalue = MathUtils.binomialCumulativeProbability(0, count + 1, total, 0.5); + + if ( pvalue > targetPvalue ) { + if ( index == BaseIndex.D.index ) + return -1; result++; + } } + return result; } } diff 
--git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java index 65e930b89..1ab001147 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsIntegrationTest.java @@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.utils.collections.Pair; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.annotations.Test; import java.io.File; @@ -74,10 +75,10 @@ public class ReduceReadsIntegrationTest extends WalkerTest { final static String emptyFileMd5 = "d41d8cd98f00b204e9800998ecf8427e"; protected Pair, List> executeTest(final String name, final WalkerTestSpec spec) { - return executeTest(name, spec, false); + return executeTest(name, spec, emptyFileMd5); } - protected Pair, List> executeTest(final String name, final WalkerTestSpec spec, final boolean disableQualsTest) { + protected Pair, List> executeTest(final String name, final WalkerTestSpec spec, final String qualsTestMD5) { final Pair, List> result = super.executeTest(name, spec); // perform some Reduce Reads specific testing now @@ -93,15 +94,13 @@ public class ReduceReadsIntegrationTest extends WalkerTest { reducedInputs.append(file.getAbsolutePath()); } - // run the coverage test - final String coverageCommand = createCommandLine("AssessReducedCoverage", originalArgs); - super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(coverageCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5))); + // the coverage test is a less stricter version of the quals test so we can safely ignore it for now + //final String 
coverageCommand = createCommandLine("AssessReducedCoverage", originalArgs); + //super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(coverageCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5))); // run the quals test - if ( !disableQualsTest ) { - final String qualsCommand = createCommandLine("AssessReducedQuals", originalArgs); - super.executeTest(name + " : QUALS_TEST", new WalkerTestSpec(qualsCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5))); - } + final String qualsCommand = createCommandLine("AssessReducedQuals", originalArgs); + super.executeTest(name + " : QUALS_TEST", new WalkerTestSpec(qualsCommand + reducedInputs.toString(), Arrays.asList(qualsTestMD5))); } return result; @@ -147,62 +146,69 @@ public class ReduceReadsIntegrationTest extends WalkerTest { } private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns) { - this.RRTest(testName, args, md5, useKnowns, false); + this.RRTest(testName, args, md5, useKnowns, emptyFileMd5); } - private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns, final boolean disableQualsTest) { + private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns, final String qualsTestMD5) { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? 
" -known " + DBSNP : "") + " "; WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5)); - executeTest(testName, spec, disableQualsTest); + executeTest(testName, spec, qualsTestMD5); } @Test(enabled = true) public void testDefaultCompression() { - RRTest("testDefaultCompression ", L, "538362abd504200800145720b23c98ce", false); + RRTest("testDefaultCompression ", L, "62f8cdb85a424e42e9c56f36302d1dba", false); } @Test(enabled = true) public void testDefaultCompressionWithKnowns() { - RRTest("testDefaultCompressionWithKnowns ", L, "79cdbd997196957af63f46353cff710b", true); + RRTest("testDefaultCompressionWithKnowns ", L, "874c0e0a54c3db67f5e9d7c0d45b7844", true); } private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; @Test(enabled = true) public void testMultipleIntervals() { - RRTest("testMultipleIntervals ", intervals, "6733b25e87e3fce5753cf7936ccf934f", false); + RRTest("testMultipleIntervals ", intervals, "2e849f8324b27af36bae8cb9b01722e6", false); } @Test(enabled = true) public void testMultipleIntervalsWithKnowns() { - RRTest("testMultipleIntervalsWithKnowns ", intervals, "99e2a79befc71eaadb4197c66a0d6df8", true); + RRTest("testMultipleIntervalsWithKnowns ", intervals, "71bc2167cc6916288bd34dcf099feebc", true); } + final String highCompressionMD5 = "c83256fa2d6785d5188f50dd45c77e0f"; + @Test(enabled = true) public void testHighCompression() { - RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e3b7e14655973c8950d7fec96321e483", false); + RRTest("testHighCompression ", " -cs 10 -min_pvalue 0.3 -mindel 0.3 " + L, highCompressionMD5, false); } @Test(enabled = true) public void testHighCompressionWithKnowns() { - RRTest("testHighCompressionWithKnowns ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "30a7ed079b3a41ed63e520260fa6afe3", 
true); + RRTest("testHighCompressionWithKnowns ", " -cs 10 -min_pvalue 0.3 -mindel 0.3 " + L, highCompressionMD5, true); } @Test(enabled = true) public void testLowCompression() { - // too much downsampling for quals test - RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", false, true); + RRTest("testLowCompression ", " -cs 30 -min_pvalue 0.001 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a903558ef284381d74b0ad837deb19f6", false); } @Test(enabled = true) public void testLowCompressionWithKnowns() { - // too much downsampling for quals test - RRTest("testLowCompressionWithKnowns ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", true, true); + RRTest("testLowCompressionWithKnowns ", " -cs 30 -min_pvalue 0.001 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a4c5aa158c6ebbc703134cbe2d48619c", true); + } + + @Test(enabled = true) + public void testBadPvalueInput() { + final String cmd = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + "-o %s -min_pvalue -0.01"; + WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, UserException.BadArgumentValue.class); + executeTest("testBadPvalueInput", spec); } @Test(enabled = true) public void testIndelCompression() { - final String md5 = "f58ae2154e0e5716be0e850b7605856e"; + final String md5 = "56154baed62be07008d3684a0a4c0996"; RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false); RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true); } @@ -210,28 +216,25 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testFilteredDeletionCompression() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; - // don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB) - 
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bfe0693aea74634f1035a9bd11302517")), true); + executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("d7655de41d90aecb716f79e32d53b2d1"))); } @Test(enabled = true) public void testCoReduction() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s "; - // don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB) - executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("930ec2e2c3b62bec7a2425a82c64f022")), true); + executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fa549ba96ca0ce5fbf3553ba173167e8"))); } @Test(enabled = true) public void testCoReductionWithKnowns() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s "; - // don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB) - executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fe7c9fd35e50a828e0f38a7ae25b60a7")), true); + executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9edcf09b21a4ae8d9fc25222bcb0486b"))); } @Test(enabled = true) public void testInsertionsAtEdgeOfConsensus() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s "; - executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4445db7aeddaf2f1d86e1af0cdc74c8"))); + executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("27cc8f1a336b2d0a29855ceb8fc988b0"))); } 
/** @@ -245,7 +248,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testAddingReadAfterTailingTheStash() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; - executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("f118e83c394d21d901a24230379864fc"))); + executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("34baf99904b676d5f132d3791030ed0a")), "3eab32c215ba68e75efd5ab7e9f7a2e7"); } /** @@ -256,7 +259,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testDivideByZero() { String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; // we expect to lose coverage due to the downsampling so don't run the systematic tests - executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bd5198a3e21034887b741faaaa3964bf"))); + executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("985c4f15a1d45267abb2f6790267930d"))); } /** @@ -266,7 +269,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { @Test(enabled = true) public void testReadOffContig() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s "; - executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4dc66445ddf5f467f67860bed023ef8"))); + executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("388ef48791965d637e4bdb45d5d7cf01"))); } /** @@ -276,8 +279,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest { public void testPairedReadsInVariantRegion() { String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, 
BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) + " -o %s --downsample_coverage 250 -dcov 50 "; - // don't use quals test here (there's one location with low quals that won't pass; signed off by EB) - executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9bed260b6245f5ff47db8541405504aa")), true); + executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("cfa2588f5edf74c5ddf3d190f5ac6f2d"))); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java index 15b79b78a..6032affa7 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReadsUnitTest.java @@ -46,9 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads; -import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; -import it.unimi.dsi.fastutil.objects.ObjectArrayList; -import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet; +import it.unimi.dsi.fastutil.objects.*; import net.sf.samtools.SAMFileHeader; import org.broad.tribble.Feature; import org.broadinstitute.sting.BaseTest; @@ -58,6 +56,7 @@ import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -198,6 +197,7 @@ public class ReduceReadsUnitTest extends BaseTest { final ReduceReads rr 
= new ReduceReads(); RodBinding.resetNameCounter(); rr.known = Arrays.>asList(new RodBinding(VariantContext.class, "known")); + rr.knownSnpPositions = new ObjectAVLTreeSet(); final GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java index f081b9f8a..4bf67f5a2 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindowUnitTest.java @@ -200,17 +200,16 @@ public class SlidingWindowUnitTest extends BaseTest { @Test(enabled = true) public void testMarkVariantRegion() { final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition); - SlidingWindow.MarkedSites markedSites = slidingWindow.new MarkedSites(); - markedSites.updateRegion(100, 100); + slidingWindow.getMarkedSitesForTesting().updateRegion(100, 100); - slidingWindow.markVariantRegion(markedSites, 40); - Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 21); + slidingWindow.markVariantRegion(40); + Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 21); - slidingWindow.markVariantRegion(markedSites, 5); - Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 37); + slidingWindow.markVariantRegion(5); + Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 37); - slidingWindow.markVariantRegion(markedSites, 95); - Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 52); + slidingWindow.markVariantRegion(95); + 
Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 52); } private static int countTrueBits(final boolean[] bitset) { @@ -254,10 +253,12 @@ public class SlidingWindowUnitTest extends BaseTest { private class ConsensusCreationTest { public final int expectedNumberOfReads, expectedNumberOfReadsWithHetCompression; public final List myReads = new ArrayList(20); + public final String description; private ConsensusCreationTest(final List locs, final boolean readsShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression) { this.expectedNumberOfReads = expectedNumberOfReads; this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression; + this.description = String.format("%d %d", expectedNumberOfReads, expectedNumberOfReadsWithHetCompression); // first, add the basic reads to the collection myReads.addAll(basicReads); @@ -270,6 +271,7 @@ public class SlidingWindowUnitTest extends BaseTest { private ConsensusCreationTest(final List locs, final CigarOperator operator, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression) { this.expectedNumberOfReads = expectedNumberOfReads; this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression; + this.description = String.format("%s %d %d", operator.toString(), expectedNumberOfReads, expectedNumberOfReadsWithHetCompression); // first, add the basic reads to the collection myReads.addAll(basicReads); @@ -279,6 +281,8 @@ public class SlidingWindowUnitTest extends BaseTest { myReads.add(createVariantRead(loc, false, false, operator)); } + public String toString() { return description; } + private GATKSAMRecord createVariantRead(final GenomeLoc loc, final boolean readShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final CigarOperator operator) { @@ -315,7 +319,6 @@ public class SlidingWindowUnitTest extends 
BaseTest { private static final GenomeLoc loc295 = new UnvalidatingGenomeLoc("1", 0, 1000295, 1000295); private static final GenomeLoc loc309 = new UnvalidatingGenomeLoc("1", 0, 1000309, 1000309); private static final GenomeLoc loc310 = new UnvalidatingGenomeLoc("1", 0, 1000310, 1000310); - private static final GenomeLoc loc312 = new UnvalidatingGenomeLoc("1", 0, 1000312, 1000312); private static final GenomeLoc loc1100 = new UnvalidatingGenomeLoc("1", 0, 1001100, 1001100); @DataProvider(name = "ConsensusCreation") @@ -328,7 +331,6 @@ public class SlidingWindowUnitTest extends BaseTest { tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc295), false, false, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc309), false, false, 10, 10)}); tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc310), false, false, 11, 11)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290, loc312), false, false, 11, 8)}); // test low quality reads tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(), true, false, 1, 1)}); @@ -346,7 +348,7 @@ public class SlidingWindowUnitTest extends BaseTest { // test mixture tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), true, false, 2, 2)}); - tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 3, 3)}); + tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc1100), false, true, 1, 1)}); // test I/D operators tests.add(new Object[]{new ConsensusCreationTest(Arrays.asList(loc290), CigarOperator.D, 9, 9)}); @@ -370,17 +372,22 @@ public class SlidingWindowUnitTest extends BaseTest { for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty - Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads); - // test WITH het compression + // test WITH het 
compression at KNOWN sites slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : test.myReads ) slidingWindow.addRead(read); for ( int i = 0; i < 1200; i++ ) knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); result = slidingWindow.close(knownSNPs); + Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); + // test WITH het compression at ALL sites + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : test.myReads ) + slidingWindow.addRead(read); + result = slidingWindow.close(null); Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression); } @@ -405,21 +412,26 @@ public class SlidingWindowUnitTest extends BaseTest { final ObjectAVLTreeSet knownSNPs = new ObjectAVLTreeSet(); // test WITHOUT het compression - SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : myReads ) slidingWindow.addRead(read); Pair, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty - Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all - // test WITH het compression - slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + // test WITH het compression at KNOWN sites + slidingWindow = 
new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); for ( final GATKSAMRecord read : myReads ) slidingWindow.addRead(read); for ( int i = 0; i < readLength; i++ ) knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i)); result = slidingWindow.close(knownSNPs); + Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all + // test WITH het compression at ALL sites + slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false); + for ( final GATKSAMRecord read : myReads ) + slidingWindow.addRead(read); + result = slidingWindow.close(knownSNPs); Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all } @@ -692,7 +704,7 @@ public class SlidingWindowUnitTest extends BaseTest { slidingWindow.actuallyUpdateHeaderForRead(windowHeader, softclippedRead, false, indexWithSoftclips); } - final boolean result = slidingWindow.hasSignificantSoftclipPosition(windowHeader, currentHeaderStart + indexToSkip); + final boolean result = slidingWindow.hasPositionWithSignificantSoftclipsOrVariant(windowHeader, currentHeaderStart + indexToSkip); Assert.assertEquals(result, indexWithSoftclips != -1 && indexWithSoftclips != indexToSkip); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index ebbc3945f..8f8ec10f5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -329,8 +329,8 @@ public class MathUtils { /** * Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space. 
* - * @param start - start of the cumulant sum (over hits) - * @param end - end of the cumulant sum (over hits) + * @param start - start (inclusive) of the cumulant sum (over hits) + * @param end - end (exclusive) of the cumulant sum (over hits) * @param total - number of attempts for the number of hits * @param probHit - probability of a successful hit * @return - returns the cumulative probability