Improve compression in Reduce Reads by incorporating probabilistic model and global het compression
The Problem:
Exomes seem to be more prone to base errors and one error in 20x coverage (or below, like most
regions in an exome) causes RR (with default settings) to consider it a variant region. This
seriously hurts compression performance.
The Solution:
1. We now use a probabilistic model for determining whether we can create a consensus (in other
words, whether we can error correct a site) instead of the old ratio threshold. We calculate
the cumulative binomial probability of seeing the given ratio and trigger consensus creation if
that pvalue is lower than the provided threshold (0.01 by default, so rather conservative).
2. We also allow het compression globally, not just at known sites. So if we cannot create a
consensus at a given site then we try to perform het compression; and if we cannot perform het
compression then we just don't reduce the variant region. This way very wonky regions stay
uncompressed, regions with one errorful read get fully compressed, and regions with one errorful
locus get het compressed.
Details:
1. -minvar is now deprecated in favor of -min_pvalue.
2. Added integration test for bad pvalue input.
3. -known argument still works to force het compression only at known sites; if it's not included
then we allow het compression anywhere. Added unit tests for this.
4. This commit includes fixes to het compression problems that were revealed by systematic qual testing.
Before finalizing het compression, we now check for insertions or other variant regions (usually due
to multi-allelics) which can render a region incompressible (and we back out if we find one). We
were checking for excessive softclips before, but now we add these tests too.
5. We now allow het compression on some but not all of the 4 consensus reads: if creating one of the
consensuses is not possible (e.g. because of excessive softclips) then we just back that one consensus
out instead of backing out all of them.
6. We no longer create a mini read at the stop of the variant window for het compression. Instead, we
allow it to be part of the next global consensus.
7. The coverage test is no longer run systematically on all integration tests because the quals test
supersedes it. The systematic quals test is now much stricter in order to catch bugs and edge cases
(very useful!).
8. Each consensus (both the normal and filtered) keeps track of its own mapping qualities (before, the MQ
for a consensus was affected by good and bad bases/reads).
9. We now completely ignore low quality bases, unless they are the only bases present in a pileup.
This way we preserve the span of reads across a region (needed for assembly). Min base qual moved to Q15.
10. Fixed long-standing bug where sliding window didn't do the right thing when removing reads that start
with insertions from a header.
Note that this commit must come serially before the next commit in which I am refactoring the binomial prob
code in MathUtils (which is failing and slow).
This commit is contained in:
parent
8e6309d56e
commit
df189293ce
|
|
@ -69,10 +69,30 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
private long sumInsertionQual_N = 0;
|
||||
private long sumDeletionQual_N = 0;
|
||||
|
||||
/*
|
||||
* Increments the count
|
||||
*
|
||||
* @param base the base
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the insertion quality
|
||||
* @param delQual the deletion quality
|
||||
* @param baseMappingQual the mapping quality
|
||||
* @param isLowQualBase true if the base is low quality
|
||||
*/
|
||||
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) {
|
||||
// if we already have high quality bases, ignore low quality ones
|
||||
if ( isLowQualBase && !isLowQuality() )
|
||||
return;
|
||||
|
||||
// if this is a high quality base then remove any low quality bases and start from scratch
|
||||
if ( !isLowQualBase && isLowQuality() ) {
|
||||
if ( totalCount() > 0 )
|
||||
clear();
|
||||
setLowQuality(false);
|
||||
}
|
||||
|
||||
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
super.incr(i, baseQual);
|
||||
super.incr(i, baseQual, baseMappingQual);
|
||||
switch (i) {
|
||||
case A: sumInsertionQual_A += insQual; sumDeletionQual_A += delQual; break;
|
||||
case C: sumInsertionQual_C += insQual; sumDeletionQual_C += delQual; break;
|
||||
|
|
@ -84,9 +104,23 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
}
|
||||
}
|
||||
|
||||
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
/*
|
||||
* Decrements the count
|
||||
*
|
||||
* @param base the base
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the insertion quality
|
||||
* @param delQual the deletion quality
|
||||
* @param baseMappingQual the mapping quality
|
||||
* @param isLowQualBase true if the base is low quality
|
||||
*/
|
||||
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual, final int baseMappingQual, final boolean isLowQualBase) {
|
||||
// if this is not the right type of base, ignore it
|
||||
if ( isLowQualBase != isLowQuality() )
|
||||
return;
|
||||
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
super.decr(i, baseQual);
|
||||
super.decr(i, baseQual, baseMappingQual);
|
||||
switch (i) {
|
||||
case A: sumInsertionQual_A -= insQual; sumDeletionQual_A -= delQual; break;
|
||||
case C: sumInsertionQual_C -= insQual; sumDeletionQual_C -= delQual; break;
|
||||
|
|
@ -131,4 +165,13 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
default: throw new IllegalArgumentException(base.name());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Clears out all stored data in this object
|
||||
*/
|
||||
public void clear() {
|
||||
super.clear();
|
||||
sumInsertionQual_A = sumInsertionQual_C = sumInsertionQual_G = sumInsertionQual_T = sumInsertionQual_D = sumInsertionQual_I = sumInsertionQual_N = 0;
|
||||
sumDeletionQual_A = sumDeletionQual_C = sumDeletionQual_G = sumDeletionQual_T = sumDeletionQual_D = sumDeletionQual_I = sumDeletionQual_N = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,6 +48,8 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
|
||||
/**
|
||||
|
|
@ -78,6 +80,8 @@ import com.google.java.contract.Requires;
|
|||
private int count_N = 0;
|
||||
private int sumQual_N = 0;
|
||||
private int totalCount = 0; // keeps track of total count since this is requested so often
|
||||
private final IntArrayList mappingQualities = new IntArrayList(); // keeps the mapping quality of each read that contributed to this
|
||||
private boolean isLowQuality = true; // this object represents low quality bases unless we are told otherwise
|
||||
|
||||
|
||||
public static BaseCounts createWithCounts(int[] countsACGT) {
|
||||
|
|
@ -100,6 +104,7 @@ import com.google.java.contract.Requires;
|
|||
this.count_I += other.count_I;
|
||||
this.count_N += other.count_N;
|
||||
this.totalCount += other.totalCount;
|
||||
this.mappingQualities.addAll(other.mappingQualities);
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
|
|
@ -112,6 +117,7 @@ import com.google.java.contract.Requires;
|
|||
this.count_I -= other.count_I;
|
||||
this.count_N -= other.count_N;
|
||||
this.totalCount -= other.totalCount;
|
||||
this.mappingQualities.removeAll(other.mappingQualities);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
|
|
@ -120,7 +126,7 @@ import com.google.java.contract.Requires;
|
|||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(final BaseIndex base, final byte qual) {
|
||||
public void incr(final BaseIndex base, final byte qual, final int mappingQuality) {
|
||||
switch (base) {
|
||||
case A: ++count_A; sumQual_A += qual; break;
|
||||
case C: ++count_C; sumQual_C += qual; break;
|
||||
|
|
@ -131,6 +137,7 @@ import com.google.java.contract.Requires;
|
|||
case N: ++count_N; sumQual_N += qual; break;
|
||||
}
|
||||
++totalCount;
|
||||
mappingQualities.add(mappingQuality);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
|
|
@ -152,7 +159,7 @@ import com.google.java.contract.Requires;
|
|||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(final BaseIndex base, final byte qual) {
|
||||
public void decr(final BaseIndex base, final byte qual, final int mappingQuality) {
|
||||
switch (base) {
|
||||
case A: --count_A; sumQual_A -= qual; break;
|
||||
case C: --count_C; sumQual_C -= qual; break;
|
||||
|
|
@ -163,6 +170,7 @@ import com.google.java.contract.Requires;
|
|||
case N: --count_N; sumQual_N -= qual; break;
|
||||
}
|
||||
--totalCount;
|
||||
mappingQualities.remove((Integer) mappingQuality);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
|
|
@ -229,6 +237,15 @@ import com.google.java.contract.Requires;
|
|||
return totalCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* The RMS of the mapping qualities of all reads that contributed to this object
|
||||
*
|
||||
* @return the RMS of the mapping qualities of all reads that contributed to this object
|
||||
*/
|
||||
public double getRMS() {
|
||||
return MathUtils.rms(mappingQualities);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a base , it returns the proportional count of this base compared to all other bases
|
||||
*
|
||||
|
|
@ -325,4 +342,26 @@ import com.google.java.contract.Requires;
|
|||
final int total = totalCountWithoutIndels();
|
||||
return (total == 0) ? 0.0 : (double)countOfBase(base) / (double)total;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if this instance represents low quality bases
|
||||
*/
|
||||
public boolean isLowQuality() { return isLowQuality; }
|
||||
|
||||
/**
|
||||
* Sets the low quality value
|
||||
*
|
||||
* @param value true if this instance represents low quality bases false otherwise
|
||||
*/
|
||||
public void setLowQuality(final boolean value) { isLowQuality = value; }
|
||||
|
||||
/**
|
||||
* Clears out all stored data in this object
|
||||
*/
|
||||
public void clear() {
|
||||
count_A = count_C = count_G = count_T = count_D = count_I = count_N = 0;
|
||||
sumQual_A = sumQual_C = sumQual_G = sumQual_T = sumQual_D = sumQual_I = sumQual_N = 0;
|
||||
totalCount = 0;
|
||||
mappingQualities.clear();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,14 +46,10 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* The element that describes the header of the sliding window.
|
||||
|
|
@ -68,7 +64,6 @@ public class HeaderElement {
|
|||
private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right
|
||||
private int nSoftClippedBases; // How many bases in this site came from soft clipped bases
|
||||
private int location; // Genome location of this site (the sliding window knows which contig we're at
|
||||
private IntArrayList mappingQuality; // keeps the mapping quality of each read that contributed to this element (site)
|
||||
|
||||
public int getLocation() {
|
||||
return location;
|
||||
|
|
@ -89,7 +84,7 @@ public class HeaderElement {
|
|||
* @param location the reference location for the new element
|
||||
*/
|
||||
public HeaderElement(final int location) {
|
||||
this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new IntArrayList());
|
||||
this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -99,7 +94,7 @@ public class HeaderElement {
|
|||
* @param location the reference location for the new element
|
||||
*/
|
||||
public HeaderElement(final int location, final int insertionsToTheRight) {
|
||||
this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location, new IntArrayList());
|
||||
this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), insertionsToTheRight, 0, location);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -110,16 +105,14 @@ public class HeaderElement {
|
|||
* @param insertionsToTheRight number of insertions to the right of this HeaderElement
|
||||
* @param nSoftClippedBases number of softclipped bases of this HeaderElement
|
||||
* @param location the reference location of this reference element
|
||||
* @param mappingQuality the list of mapping quality values of all reads that contributed to this
|
||||
* HeaderElement
|
||||
*/
|
||||
public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, IntArrayList mappingQuality) {
|
||||
public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location) {
|
||||
this.consensusBaseCounts = consensusBaseCounts;
|
||||
this.filteredBaseCounts = filteredBaseCounts;
|
||||
this.insertionsToTheRight = insertionsToTheRight;
|
||||
this.nSoftClippedBases = nSoftClippedBases;
|
||||
this.location = location;
|
||||
this.mappingQuality = mappingQuality;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -128,35 +121,52 @@ public class HeaderElement {
|
|||
*
|
||||
* @return true if site is variant by any definition. False otherwise.
|
||||
*/
|
||||
public boolean isVariant(double minVariantProportion, double minIndelProportion) {
|
||||
return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
|
||||
public boolean isVariant(double minVariantPvalue, double minIndelProportion) {
|
||||
return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantPvalue) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new base to the HeaderElement updating all counts accordingly
|
||||
*
|
||||
* @param base the base to add
|
||||
* @param base the base to add
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the base insertion quality
|
||||
* @param delQual the base deletion quality
|
||||
* @param baseMappingQuality the mapping quality of the read this base belongs to
|
||||
* @param minBaseQual the minimum base qual allowed to be a good base
|
||||
* @param minMappingQual the minimum mapping qual allowed to be a good read
|
||||
* @param isSoftClipped true if the base is soft-clipped in the original read
|
||||
*/
|
||||
public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
|
||||
if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
|
||||
consensusBaseCounts.incr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
|
||||
// If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts
|
||||
if ( baseMappingQuality >= minMappingQual )
|
||||
consensusBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
|
||||
else
|
||||
filteredBaseCounts.incr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts
|
||||
filteredBaseCounts.incr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
|
||||
|
||||
this.mappingQuality.add(baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site
|
||||
nSoftClippedBases += isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter
|
||||
nSoftClippedBases += isSoftClipped ? 1 : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new base to the HeaderElement updating all counts accordingly
|
||||
*
|
||||
* @param base the base to add
|
||||
* @param baseQual the base quality
|
||||
* @param insQual the base insertion quality
|
||||
* @param delQual the base deletion quality
|
||||
* @param baseMappingQuality the mapping quality of the read this base belongs to
|
||||
* @param minBaseQual the minimum base qual allowed to be a good base
|
||||
* @param minMappingQual the minimum mapping qual allowed to be a good read
|
||||
* @param isSoftClipped true if the base is soft-clipped in the original read
|
||||
*/
|
||||
public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
|
||||
if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
|
||||
consensusBaseCounts.decr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
|
||||
// If the base passes the MQ filter it is included in the consensus base counts, otherwise it's part of the filtered counts
|
||||
if ( baseMappingQuality >= minMappingQual )
|
||||
consensusBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
|
||||
else
|
||||
filteredBaseCounts.decr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts
|
||||
filteredBaseCounts.decr(base, baseQual, insQual, delQual, baseMappingQuality, baseQual < minBaseQual);
|
||||
|
||||
this.mappingQuality.remove((Integer) baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site
|
||||
nSoftClippedBases -= isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter
|
||||
nSoftClippedBases -= isSoftClipped ? 1 : 0;
|
||||
}
|
||||
/**
|
||||
* Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions
|
||||
|
|
@ -193,15 +203,6 @@ public class HeaderElement {
|
|||
return (!hasFilteredData() && !hasConsensusData());
|
||||
}
|
||||
|
||||
/**
|
||||
* The RMS of the mapping qualities of all reads that contributed to this HeaderElement
|
||||
*
|
||||
* @return the RMS of the mapping qualities of all reads that contributed to this HeaderElement
|
||||
*/
|
||||
public double getRMS() {
|
||||
return MathUtils.rms(mappingQuality);
|
||||
}
|
||||
|
||||
/**
|
||||
* removes an insertion from this element (if you removed a read that had an insertion)
|
||||
*/
|
||||
|
|
@ -236,7 +237,7 @@ public class HeaderElement {
|
|||
/**
|
||||
* Whether or not the HeaderElement is variant due to excess deletions
|
||||
*
|
||||
* @return whether or not the HeaderElement is variant due to excess insertions
|
||||
* @return whether or not the HeaderElement is variant due to excess deletions
|
||||
*/
|
||||
private boolean isVariantFromDeletions(double minIndelProportion) {
|
||||
return consensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || consensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion;
|
||||
|
|
@ -245,12 +246,15 @@ public class HeaderElement {
|
|||
/**
|
||||
* Whether or not the HeaderElement is variant due to excess mismatches
|
||||
*
|
||||
* @return whether or not the HeaderElement is variant due to excess insertions
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @return whether or not the HeaderElement is variant due to excess mismatches
|
||||
*/
|
||||
protected boolean isVariantFromMismatches(double minVariantProportion) {
|
||||
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
|
||||
double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
|
||||
return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
|
||||
protected boolean isVariantFromMismatches(double minVariantPvalue) {
|
||||
final int totalCount = consensusBaseCounts.totalCountWithoutIndels();
|
||||
final BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
|
||||
final int countOfOtherBases = totalCount - consensusBaseCounts.countOfBase(mostCommon);
|
||||
final double pvalue = countOfOtherBases == 0 ? 0.0 : MathUtils.binomialCumulativeProbability(0, countOfOtherBases+1, totalCount, 0.5);
|
||||
return pvalue > minVariantPvalue;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -263,42 +267,44 @@ public class HeaderElement {
|
|||
return nSoftClippedBases > 0 && nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases);
|
||||
}
|
||||
|
||||
protected boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) {
|
||||
return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the number of alleles necessary to represent this site.
|
||||
*
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @param allowDeletions should we allow deletions?
|
||||
* @return the number of alleles necessary to represent this site or -1 if allowDeletions is false and there are a sufficient number of them
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @return the number of alleles necessary to represent this site or -1 if there are too many indels
|
||||
*/
|
||||
public int getNumberOfAlleles(final double minVariantProportion, final boolean allowDeletions) {
|
||||
final List<BaseIndex> alleles = getAlleles(minVariantProportion, allowDeletions);
|
||||
public int getNumberOfBaseAlleles(final double minVariantPvalue) {
|
||||
final ObjectArrayList<BaseIndex> alleles = getAlleles(minVariantPvalue);
|
||||
return alleles == null ? -1 : alleles.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the alleles necessary to represent this site.
|
||||
*
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @param allowDeletions should we allow deletions?
|
||||
* @return the list of alleles necessary to represent this site or null if allowDeletions is false and there are a sufficient number of them
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @return the list of alleles necessary to represent this site or null if there are too many indels
|
||||
*/
|
||||
public List<BaseIndex> getAlleles(final double minVariantProportion, final boolean allowDeletions) {
|
||||
public ObjectArrayList<BaseIndex> getAlleles(final double minVariantPvalue) {
|
||||
// make sure we have bases at all
|
||||
final int totalBaseCount = consensusBaseCounts.totalCount();
|
||||
if ( totalBaseCount == 0 )
|
||||
return Collections.emptyList();
|
||||
return new ObjectArrayList<BaseIndex>(0);
|
||||
|
||||
final int minBaseCountForRelevantAlleles = Math.max(1, (int)(minVariantProportion * totalBaseCount));
|
||||
// next, check for insertions
|
||||
if ( hasSignificantCount(insertionsToTheRight, minVariantPvalue) )
|
||||
return null;
|
||||
|
||||
final List<BaseIndex> alleles = new ArrayList<BaseIndex>(4);
|
||||
// finally, check for the bases themselves (including deletions)
|
||||
final ObjectArrayList<BaseIndex> alleles = new ObjectArrayList<BaseIndex>(4);
|
||||
for ( final BaseIndex base : BaseIndex.values() ) {
|
||||
final int baseCount = consensusBaseCounts.countOfBase(base);
|
||||
if ( baseCount == 0 )
|
||||
continue;
|
||||
|
||||
if ( baseCount >= minBaseCountForRelevantAlleles ) {
|
||||
if ( !allowDeletions && base == BaseIndex.D )
|
||||
final double pvalue = MathUtils.binomialCumulativeProbability(0, baseCount+1, totalBaseCount, 0.5);
|
||||
|
||||
if ( pvalue > minVariantPvalue ) {
|
||||
if ( base == BaseIndex.D )
|
||||
return null;
|
||||
alleles.add(base);
|
||||
}
|
||||
|
|
@ -309,15 +315,26 @@ public class HeaderElement {
|
|||
/*
|
||||
* Checks whether there are a significant number of softclips.
|
||||
*
|
||||
* @param minVariantProportion the minimum proportion to consider something significant.
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @return true if there are significant softclips, false otherwise
|
||||
*/
|
||||
public boolean hasSignificantSoftclips(final double minVariantProportion) {
|
||||
public boolean hasSignificantSoftclips(final double minVariantPvalue) {
|
||||
return hasSignificantCount(nSoftClippedBases, minVariantPvalue);
|
||||
}
|
||||
|
||||
/*
|
||||
* Checks whether there are a significant number of count.
|
||||
*
|
||||
* @param count the count to test against
|
||||
* @param minVariantPvalue the minimum pvalue to call a site variant.
|
||||
* @return true if there is a significant count given the provided pvalue, false otherwise
|
||||
*/
|
||||
private boolean hasSignificantCount(final int count, final double minVariantPvalue) {
|
||||
final int totalBaseCount = consensusBaseCounts.totalCount();
|
||||
if ( totalBaseCount == 0 )
|
||||
if ( count == 0 || totalBaseCount == 0 )
|
||||
return false;
|
||||
|
||||
final int minBaseCountForSignificance = Math.max(1, (int)(minVariantProportion * totalBaseCount));
|
||||
return nSoftClippedBases >= minBaseCountForSignificance;
|
||||
final double pvalue = MathUtils.binomialCumulativeProbability(0, count+1, totalBaseCount, 0.5);
|
||||
return pvalue > minVariantPvalue;
|
||||
}
|
||||
}
|
||||
|
|
@ -96,14 +96,14 @@ public class MultiSampleCompressor {
|
|||
final int contextSize,
|
||||
final int downsampleCoverage,
|
||||
final int minMappingQuality,
|
||||
final double minAltProportionToTriggerVariant,
|
||||
final double minAltPValueToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy) {
|
||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||
compressorsPerSample.put(name,
|
||||
new SingleSampleCompressor(contextSize, downsampleCoverage,
|
||||
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
minMappingQuality, minAltPValueToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -64,6 +64,7 @@ import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -139,19 +140,21 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
* towards variable regions.
|
||||
*/
|
||||
@Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
|
||||
public byte minBaseQual = 20;
|
||||
public byte minBaseQual = 15;
|
||||
|
||||
/**
|
||||
* Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
|
||||
* lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
|
||||
* Reads have notoriously low quality bases on the tails (left and right). Consecutive bases at the tails with
|
||||
* quality at or lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
|
||||
*/
|
||||
@Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
|
||||
public byte minTailQuality = 2;
|
||||
|
||||
/**
|
||||
* Any number of VCF files representing known SNPs to be used for the experimental polyploid-based reduction.
|
||||
* Any number of VCF files representing known SNPs to be used for the polyploid-based reduction.
|
||||
* Could be e.g. dbSNP and/or official 1000 Genomes SNP calls. Non-SNP variants in these files will be ignored.
|
||||
* Note that polyploid ("het") compression will work only when a single SNP is present in a consensus window.
|
||||
* If provided, the polyploid ("het") compression will work only when a single SNP from the known set is present
|
||||
* in a consensus window (otherwise there will be no reduction); if not provided then polyploid compression will
|
||||
* be triggered anywhere there is a single SNP present in a consensus window.
|
||||
*/
|
||||
@Input(fullName="known_sites_for_polyploid_reduction", shortName = "known", doc="Input VCF file(s) with known SNPs", required=false)
|
||||
public List<RodBinding<VariantContext>> known = Collections.emptyList();
|
||||
|
|
@ -204,9 +207,18 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
* Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
|
||||
* considered consensus.
|
||||
*/
|
||||
@Deprecated
|
||||
@Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
|
||||
public double minAltProportionToTriggerVariant = 0.05;
|
||||
|
||||
/**
|
||||
* Minimum p-value from binomial distribution of mismatches in a site to trigger a variant region.
|
||||
* Any site with a value falling below this will be considered consensus and reduced (otherwise we will try to trigger polyploid compression).
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "minimum_alt_pvalue_to_trigger_variant", shortName = "min_pvalue", doc = "", required = false)
|
||||
public double minAltPValueToTriggerVariant = 0.01;
|
||||
|
||||
/**
|
||||
* Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
|
||||
* considered consensus.
|
||||
|
|
@ -253,7 +265,7 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
|
||||
ObjectSortedSet<GenomeLoc> intervalList;
|
||||
|
||||
final ObjectSortedSet<GenomeLoc> knownSnpPositions = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
ObjectSortedSet<GenomeLoc> knownSnpPositions;
|
||||
|
||||
// IMPORTANT: DO NOT CHANGE THE VALUE OF THIS CONSTANT VARIABLE; IT IS NOW PERMANENTLY THE @PG NAME THAT EXTERNAL TOOLS LOOK FOR IN THE BAM HEADER
|
||||
public static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
|
||||
|
|
@ -273,6 +285,14 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
if ( nwayout && out != null )
|
||||
throw new UserException.CommandLineException("--out and --nwayout can not be used simultaneously; please use one or the other");
|
||||
|
||||
if ( minAltPValueToTriggerVariant < 0.0 || minAltPValueToTriggerVariant > 1.0 )
|
||||
throw new UserException.BadArgumentValue("--minimum_alt_pvalue_to_trigger_variant", "must be a value between 0 and 1 (inclusive)");
|
||||
|
||||
if ( known.isEmpty() )
|
||||
knownSnpPositions = null;
|
||||
else
|
||||
knownSnpPositions = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
GenomeAnalysisEngine toolkit = getToolkit();
|
||||
readNameHash = new Object2LongOpenHashMap<String>(100000); // prepare the read name hash to keep track of what reads have had their read names compressed
|
||||
intervalList = new ObjectAVLTreeSet<GenomeLoc>(); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
|
||||
|
|
@ -392,7 +412,7 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
*/
|
||||
@Override
|
||||
public ReduceReadsStash reduceInit() {
|
||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltPValueToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -470,8 +490,8 @@ public class ReduceReads extends ReadWalker<ObjectArrayList<GATKSAMRecord>, Redu
|
|||
* @param read the current read, used for checking whether there are stale positions we can remove
|
||||
*/
|
||||
protected void clearStaleKnownPositions(final GATKSAMRecord read) {
|
||||
// nothing to clear if empty
|
||||
if ( knownSnpPositions.isEmpty() )
|
||||
// nothing to clear if not used or empty
|
||||
if ( knownSnpPositions == null || knownSnpPositions.isEmpty() )
|
||||
return;
|
||||
|
||||
// not ready to be cleared until we encounter a read from a different contig
|
||||
|
|
|
|||
|
|
@ -62,7 +62,7 @@ public class SingleSampleCompressor {
|
|||
final private int contextSize;
|
||||
final private int downsampleCoverage;
|
||||
final private int minMappingQuality;
|
||||
final private double minAltProportionToTriggerVariant;
|
||||
final private double minAltPValueToTriggerVariant;
|
||||
final private double minIndelProportionToTriggerVariant;
|
||||
final private int minBaseQual;
|
||||
final private ReduceReads.DownsampleStrategy downsampleStrategy;
|
||||
|
|
@ -75,7 +75,7 @@ public class SingleSampleCompressor {
|
|||
public SingleSampleCompressor(final int contextSize,
|
||||
final int downsampleCoverage,
|
||||
final int minMappingQuality,
|
||||
final double minAltProportionToTriggerVariant,
|
||||
final double minAltPValueToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy) {
|
||||
|
|
@ -83,7 +83,7 @@ public class SingleSampleCompressor {
|
|||
this.downsampleCoverage = downsampleCoverage;
|
||||
this.minMappingQuality = minMappingQuality;
|
||||
this.slidingWindowCounter = 0;
|
||||
this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant;
|
||||
this.minAltPValueToTriggerVariant = minAltPValueToTriggerVariant;
|
||||
this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
|
||||
this.minBaseQual = minBaseQual;
|
||||
this.downsampleStrategy = downsampleStrategy;
|
||||
|
|
@ -114,7 +114,7 @@ public class SingleSampleCompressor {
|
|||
}
|
||||
|
||||
if ( slidingWindow == null) { // this is the first read
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltPValueToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
|
||||
slidingWindowCounter++;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -97,9 +97,8 @@ public class SlidingWindow {
|
|||
protected int filteredDataConsensusCounter;
|
||||
protected String filteredDataReadName;
|
||||
|
||||
|
||||
// Additional parameters
|
||||
protected double MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches
|
||||
protected double MIN_ALT_PVALUE_TO_TRIGGER_VARIANT; // pvalue has to be greater than this value to trigger variant region due to mismatches
|
||||
protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions
|
||||
protected int MIN_BASE_QUAL_TO_COUNT; // qual has to be greater than or equal to this value
|
||||
protected int MIN_MAPPING_QUALITY;
|
||||
|
|
@ -150,11 +149,15 @@ public class SlidingWindow {
|
|||
this.readsInWindow = new ObjectAVLTreeSet<GATKSAMRecord>();
|
||||
}
|
||||
|
||||
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) {
|
||||
public SlidingWindow(final String contig, final int contigIndex, final int contextSize, final SAMFileHeader samHeader,
|
||||
final GATKSAMReadGroupRecord readGroupAttribute, final int windowNumber,
|
||||
final double minAltPValueToTriggerVariant, final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual, final int minMappingQuality, final int downsampleCoverage,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy, final boolean hasIndelQualities) {
|
||||
this.contextSize = contextSize;
|
||||
this.downsampleCoverage = downsampleCoverage;
|
||||
|
||||
this.MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant;
|
||||
this.MIN_ALT_PVALUE_TO_TRIGGER_VARIANT = minAltPValueToTriggerVariant;
|
||||
this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant;
|
||||
this.MIN_BASE_QUAL_TO_COUNT = minBaseQual;
|
||||
this.MIN_MAPPING_QUALITY = minMappingQuality;
|
||||
|
|
@ -341,8 +344,14 @@ public class SlidingWindow {
|
|||
private final MarkedSites markedSites = new MarkedSites();
|
||||
|
||||
/**
|
||||
* returns an array marked with variant and non-variant regions (it uses
|
||||
* markVariantRegions to make the marks)
|
||||
* returns the MarkedSites object so that it can be tested after adding data to the Sliding Window
|
||||
*
|
||||
* @return the Marked Sites object used by this Sliding Window
|
||||
*/
|
||||
protected MarkedSites getMarkedSitesForTesting() { return markedSites; }
|
||||
|
||||
/**
|
||||
* returns an array marked with variant and non-variant regions (it uses markVariantRegion to make the marks)
|
||||
*
|
||||
* @param stop check the window from start to stop (not-inclusive)
|
||||
*/
|
||||
|
|
@ -368,8 +377,8 @@ public class SlidingWindow {
|
|||
if (headerElementIterator.hasNext()) {
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
|
||||
if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
|
||||
markVariantRegion(markedSites, i - windowHeaderStartLocation);
|
||||
if (headerElement.isVariant(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
|
||||
markVariantRegion(i - windowHeaderStartLocation);
|
||||
|
||||
} else
|
||||
break;
|
||||
|
|
@ -379,14 +388,24 @@ public class SlidingWindow {
|
|||
/**
|
||||
* Marks the sites around the variant site (as true)
|
||||
*
|
||||
* @param markedSites the boolean array to bear the marks
|
||||
* @param variantSiteLocation the location where a variant site was found
|
||||
*/
|
||||
protected void markVariantRegion(final MarkedSites markedSites, final int variantSiteLocation) {
|
||||
protected void markVariantRegion(final int variantSiteLocation) {
|
||||
int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
|
||||
int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length : variantSiteLocation + contextSize + 1;
|
||||
for (int i = from; i < to; i++)
|
||||
markedSites.getVariantSiteBitSet()[i] = true;
|
||||
int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length - 1 : variantSiteLocation + contextSize;
|
||||
markRegionAs(from, to, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Marks the sites around the variant site (as true)
|
||||
*
|
||||
* @param from the start index (inclusive) to mark
|
||||
* @param to the end index (inclusive) to mark
|
||||
* @param isVariant mark the region with this boolean value
|
||||
*/
|
||||
private void markRegionAs(final int from, final int to, final boolean isVariant) {
|
||||
for (int i = from; i <= to; i++)
|
||||
markedSites.getVariantSiteBitSet()[i] = isVariant;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -580,7 +599,7 @@ public class SlidingWindow {
|
|||
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), hasIndelQualities, strandType);
|
||||
}
|
||||
|
||||
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
|
||||
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts());
|
||||
}
|
||||
|
||||
return result;
|
||||
|
|
@ -611,7 +630,7 @@ public class SlidingWindow {
|
|||
if (!headerElement.hasConsensusData())
|
||||
throw new ReviewedStingException("No CONSENSUS data in " + index);
|
||||
|
||||
genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts(), headerElement.getRMS());
|
||||
genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -620,15 +639,14 @@ public class SlidingWindow {
|
|||
*
|
||||
* @param syntheticRead the synthetic read to add to
|
||||
* @param baseCounts the base counts object in the header element
|
||||
* @param rms the rms mapping quality in the header element
|
||||
*/
|
||||
private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
|
||||
private void genericAddBaseToConsensus(final SyntheticRead syntheticRead, final BaseAndQualsCounts baseCounts) {
|
||||
final BaseIndex base = baseCounts.baseIndexWithMostProbability();
|
||||
byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE);
|
||||
byte qual = baseCounts.averageQualsOfBase(base);
|
||||
byte insQual = baseCounts.averageInsertionQualsOfBase(base);
|
||||
byte delQual = baseCounts.averageDeletionQualsOfBase(base);
|
||||
syntheticRead.add(base, count, qual, insQual, delQual, rms);
|
||||
syntheticRead.add(base, count, qual, insQual, delQual, baseCounts.getRMS());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@@ -636,39 +654,30 @@
|
|||
*
|
||||
* @param start the first window header index in the variant region (inclusive)
|
||||
* @param stop the last window header index of the variant region (inclusive)
|
||||
* @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here
|
||||
* @return a non-null list of all reads contained in the variant region
|
||||
* @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere)
|
||||
* @return a non-null object representing all reads contained in the variant region
|
||||
*/
|
||||
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
|
||||
@Ensures("result != null")
|
||||
protected ObjectList<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
ObjectList<GATKSAMRecord> allReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
protected CloseVariantRegionResult compressVariantRegion(final int start, final int stop, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
final CloseVariantRegionResult allReads = new CloseVariantRegionResult(stop);
|
||||
|
||||
// Try to compress into a polyploid consensus
|
||||
// Optimization: don't bother if there are no known SNPs
|
||||
final int hetRefPosition = knownSnpPositions.isEmpty() ? -1 : findSinglePolyploidCompressiblePosition(start, stop);
|
||||
|
||||
boolean successfullyCreatedPolyploidConsensus = false;
|
||||
// Optimization: don't bother if there are no known SNPs here
|
||||
final int hetRefPosition = (knownSnpPositions != null && knownSnpPositions.isEmpty()) ? -1 : findSinglePolyploidCompressiblePosition(start, stop);
|
||||
|
||||
// Note that using the hetRefPosition protects us from trying to compress variant regions that are created by
|
||||
// insertions (which we don't want because we can't confirm that they represent the same allele).
|
||||
// Also, we only allow polyploid consensus creation at known sites.
|
||||
// Also, we only allow polyploid consensus creation at known sites if provided.
|
||||
if ( hetRefPosition != -1 && matchesKnownPosition(windowHeader.get(hetRefPosition).getLocation(), knownSnpPositions) ) {
|
||||
|
||||
// try to create the polyploid consensus
|
||||
final ObjectList<GATKSAMRecord> polyploidReads = createPolyploidConsensus(start, stop, hetRefPosition);
|
||||
|
||||
// if successful we are good to go!
|
||||
if ( polyploidReads != null ) {
|
||||
allReads.addAll(polyploidReads);
|
||||
successfullyCreatedPolyploidConsensus = true;
|
||||
}
|
||||
allReads.reads.addAll(createPolyploidConsensus(hetRefPosition));
|
||||
allReads.stopPerformed = hetRefPosition; // we stopped at the het position
|
||||
}
|
||||
|
||||
// if we can't create a polyploid consensus here, return all reads that overlap the variant region and remove them
|
||||
// from the window header entirely; also remove all reads preceding the variant region (since they will be output
|
||||
// as consensus right after compression)
|
||||
if ( !successfullyCreatedPolyploidConsensus ) {
|
||||
else {
|
||||
final int refStart = windowHeader.get(start).getLocation();
|
||||
final int refStop = windowHeader.get(stop).getLocation();
|
||||
|
||||
|
|
@ -676,7 +685,7 @@ public class SlidingWindow {
|
|||
for ( final GATKSAMRecord read : readsInWindow ) {
|
||||
if ( read.getSoftStart() <= refStop ) {
|
||||
if ( read.getAlignmentEnd() >= refStart ) {
|
||||
allReads.add(read);
|
||||
allReads.reads.add(read);
|
||||
removeFromHeader(windowHeader, read);
|
||||
}
|
||||
toRemove.add(read);
|
||||
|
|
@ -687,6 +696,7 @@ public class SlidingWindow {
|
|||
for ( final GATKSAMRecord read : toRemove )
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
|
||||
return allReads;
|
||||
}
|
||||
|
||||
|
|
@ -694,13 +704,13 @@ public class SlidingWindow {
|
|||
* Determines whether the given position matches one of the known sites.
*
* @param targetPosition    the position of the het site
* @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere)
* @return true if the targetPosition matches a known SNP position (or if no known set was provided), false otherwise
*/
@Requires("targetPosition >= 1")
protected boolean matchesKnownPosition(final int targetPosition, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
    // a null set means het compression is allowed anywhere, so every position matches
    if ( knownSnpPositions == null )
        return true;
    final GenomeLoc targetLoc = new UnvalidatingGenomeLoc(contig, contigIndex, targetPosition, targetPosition);
    return knownSnpPositions.contains(targetLoc);
}
|
||||
|
||||
/*
|
||||
|
|
@ -716,7 +726,7 @@ public class SlidingWindow {
|
|||
|
||||
for ( int i = start; i <= stop; i++ ) {
|
||||
|
||||
final int nAlleles = windowHeader.get(i).getNumberOfAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false);
|
||||
final int nAlleles = windowHeader.get(i).getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT);
|
||||
|
||||
// we will only work on diploid non-indel cases because we just don't want to handle/test other scenarios
|
||||
if ( nAlleles > 2 || nAlleles == -1 )
|
||||
|
|
@ -736,21 +746,22 @@ public class SlidingWindow {
|
|||
}
|
||||
|
||||
/*
|
||||
* Checks whether there's a position in the header with a significant number of softclips.
|
||||
* Checks whether there's a position in the header with a significant number of softclips or a variant.
|
||||
*
|
||||
* @param header the window header to examine
|
||||
* @param positionToSkip the global position to skip in the examination (use negative number if you don't want to make use of this argument)
|
||||
* @return true if there exists a position with significant softclips, false otherwise
|
||||
*/
|
||||
@Requires("header != null")
|
||||
protected boolean hasSignificantSoftclipPosition(final List<HeaderElement> header, final int positionToSkip) {
|
||||
protected boolean hasPositionWithSignificantSoftclipsOrVariant(final List<HeaderElement> header, final int positionToSkip) {
|
||||
|
||||
for ( final HeaderElement headerElement : header ) {
|
||||
|
||||
if ( headerElement.getLocation() == positionToSkip )
|
||||
continue;
|
||||
|
||||
if ( headerElement.hasSignificantSoftclips(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT) )
|
||||
if ( headerElement.hasSignificantSoftclips(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) ||
|
||||
headerElement.getNumberOfBaseAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) > 1 )
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
@@ -762,29 +773,45 @@
|
|||
*
* @param start             the first window header index in the variant region (inclusive)
* @param stop              the last window header index of the variant region (inclusive)
* @param knownSnpPositions the set of known SNPs used to determine whether to allow polyploid consensus creation here; can be null (to allow polyploid consensus anywhere)
* @return a non-null object representing all reads contained in the variant region plus any adjacent synthetic reads
*/
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
@Ensures("result != null")
protected CloseVariantRegionResult closeVariantRegion(final int start, final int stop, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
    // compress the region; note that the compression may stop before the requested stop index
    final CloseVariantRegionResult allReads = compressVariantRegion(start, stop, knownSnpPositions);
    final CloseVariantRegionResult result = new CloseVariantRegionResult(allReads.stopPerformed);

    // downsample the region's reads if a downsampling target was requested
    final ObjectList<GATKSAMRecord> regionReads = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads.reads) : allReads.reads;
    result.reads.addAll(regionReads);

    // emit the consensus reads preceding (and including) the portion of the region that was actually closed
    result.reads.addAll(addToSyntheticReads(windowHeader, 0, allReads.stopPerformed + 1, SyntheticRead.StrandType.STRANDLESS));
    result.reads.addAll(finalizeAndAdd(ConsensusType.BOTH));

    return result; // finalized reads will be downsampled if necessary
}
|
||||
|
||||
/*
|
||||
* @see #closeVariantRegions(CompressionStash, ObjectSortedSet<GenomeLoc>, boolean) with forceCloseFullRegions set to false
|
||||
*/
|
||||
public ObjectSet<GATKSAMRecord> closeVariantRegions(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
return closeVariantRegions(regions, knownSnpPositions, false);
|
||||
}
|
||||
|
||||
private static final class CloseVariantRegionResult {
|
||||
final private ObjectList<GATKSAMRecord> reads = new ObjectArrayList<GATKSAMRecord>();
|
||||
private int stopPerformed;
|
||||
|
||||
public CloseVariantRegionResult(final int stopPerformed) { this.stopPerformed = stopPerformed; }
|
||||
}
|
||||
|
||||
/*
|
||||
* Finalizes the list of regions requested (and any regions preceding them)
|
||||
*
|
||||
* @param regions the list of regions to finalize
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere)
|
||||
* @param forceCloseFullRegions if true, requires this method to make sure all regions are fully closed; otherwise, we may decide not to close up to the very end (e.g. during het compression)
|
||||
* @return a non-null set of reduced reads representing the finalized regions
|
||||
*/
|
||||
public ObjectSet<GATKSAMRecord> closeVariantRegions(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions) {
|
||||
public ObjectSet<GATKSAMRecord> closeVariantRegions(final CompressionStash regions, final ObjectSortedSet<GenomeLoc> knownSnpPositions, final boolean forceCloseFullRegions) {
|
||||
final ObjectAVLTreeSet<GATKSAMRecord> allReads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
if ( !regions.isEmpty() ) {
|
||||
|
||||
|
|
@ -794,9 +821,33 @@ public class SlidingWindow {
|
|||
for ( final GenomeLoc region : regions ) {
|
||||
if (((FinishedGenomeLoc)region).isFinished() && region.getContig().equals(contig) && region.getStart() >= windowHeaderStart && region.getStop() < windowHeaderStart + windowHeader.size()) {
|
||||
final int start = region.getStart() - windowHeaderStart;
|
||||
final int stop = region.getStop() - windowHeaderStart;
|
||||
int stop = region.getStop() - windowHeaderStart;
|
||||
|
||||
allReads.addAll(closeVariantRegion(start, stop, knownSnpPositions));
|
||||
CloseVariantRegionResult closeVariantRegionResult = closeVariantRegion(start, stop, knownSnpPositions);
|
||||
allReads.addAll(closeVariantRegionResult.reads);
|
||||
|
||||
// check whether we didn't close the whole region that was requested
|
||||
if ( stop > 0 && closeVariantRegionResult.stopPerformed < stop ) {
|
||||
// we should update the variant sites bitset because the context size's worth of bases after the variant position are no longer "variant"
|
||||
markRegionAs(closeVariantRegionResult.stopPerformed + 1, stop, false);
|
||||
|
||||
// if the calling method said that it didn't care then we are okay so update the stop
|
||||
if ( !forceCloseFullRegions ) {
|
||||
stop = closeVariantRegionResult.stopPerformed;
|
||||
}
|
||||
// otherwise, we need to forcibly push the stop that we originally requested
|
||||
else {
|
||||
while ( closeVariantRegionResult.stopPerformed < stop ) {
|
||||
// first clean up used header elements so they don't get reused
|
||||
for ( int i = 0; i <= closeVariantRegionResult.stopPerformed; i++ )
|
||||
windowHeader.remove();
|
||||
stop -= (closeVariantRegionResult.stopPerformed + 1);
|
||||
|
||||
closeVariantRegionResult = closeVariantRegion(0, stop, knownSnpPositions);
|
||||
allReads.addAll(closeVariantRegionResult.reads);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// We need to clean up the window header elements up until the end of the requested region so that they don't get used for future regions.
|
||||
// Note that this cleanup used to happen outside the above for-loop, but that was causing an occasional doubling of the reduced reads
|
||||
|
|
@ -847,7 +898,7 @@ public class SlidingWindow {
|
|||
* regions that still exist regardless of being able to fulfill the
|
||||
* context size requirement in the end.
|
||||
*
|
||||
* @param knownSnpPositions the set of known SNP positions
|
||||
* @param knownSnpPositions the set of known SNP positions; can be null (to allow polyploid consensus anywhere)
|
||||
* @return A non-null set/list of all reads generated
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
|
|
@ -855,12 +906,11 @@ public class SlidingWindow {
|
|||
// mark variant regions
|
||||
ObjectSet<GATKSAMRecord> finalizedReads = new ObjectAVLTreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
CompressionStash regions = new CompressionStash();
|
||||
boolean forceCloseUnfinishedRegions = true;
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
markSites(getStopLocation(windowHeader) + 1);
|
||||
regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), forceCloseUnfinishedRegions);
|
||||
finalizedReads = closeVariantRegions(regions, knownSnpPositions);
|
||||
regions = findVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet(), true);
|
||||
finalizedReads = closeVariantRegions(regions, knownSnpPositions, true);
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), SyntheticRead.StrandType.STRANDLESS));
|
||||
|
|
@ -908,117 +958,105 @@ public class SlidingWindow {
|
|||
}
|
||||
|
||||
// define this so that we can use Java generics below
|
||||
private static class HeaderElementList extends LinkedList<HeaderElement> {}
|
||||
private final static class HeaderElementList extends LinkedList<HeaderElement> {}
|
||||
|
||||
private final static class SingleStrandConsensusData {
|
||||
final HeaderElementList consensus = new HeaderElementList();
|
||||
final ObjectList<GATKSAMRecord> reads = new ObjectArrayList<GATKSAMRecord>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Finalizes a variant region for point mutations, and any adjacent synthetic reads. Indel sites are not supported.
|
||||
* Finalizes a variant region - and any adjacent synthetic reads - for point mutations (indel sites are not
|
||||
* supported) with polyploid compression.
|
||||
*
|
||||
* @param start the first window header index of the variant region (inclusive)
|
||||
* @param stop the last window header index of the variant region (inclusive)
|
||||
* @param hetRefPosition window header index of the het site; MUST NOT BE AN INDEL SITE!
|
||||
* @return a list of all reads contained in the variant region as a polyploid consensus, or null if not possible
|
||||
* @return a non-null list of all reads contained in the variant region as a polyploid consensus
|
||||
*/
|
||||
@Requires({"start >= 0 && (stop >= start || stop == 0)"})
|
||||
protected ObjectList<GATKSAMRecord> createPolyploidConsensus(final int start, final int stop, final int hetRefPosition) {
|
||||
@Ensures({"result != null"})
|
||||
protected ObjectList<GATKSAMRecord> createPolyploidConsensus(final int hetRefPosition) {
|
||||
// we will create two (positive strand, negative strand) headers for each haplotype
|
||||
final HeaderElementList[] headersPosStrand = new HeaderElementList[2];
|
||||
final HeaderElementList[] headersNegStrand = new HeaderElementList[2];
|
||||
final SingleStrandConsensusData[] headersPosStrand = new SingleStrandConsensusData[2];
|
||||
final SingleStrandConsensusData[] headersNegStrand = new SingleStrandConsensusData[2];
|
||||
|
||||
final int refStart = windowHeader.get(start).getLocation();
|
||||
final int refStop = windowHeader.get(stop).getLocation();
|
||||
final int globalHetRefPosition = windowHeader.get(hetRefPosition).getLocation();
|
||||
|
||||
// initialize the mapping from base (allele) to header
|
||||
final Byte2IntMap alleleHeaderMap = new Byte2IntArrayMap(2);
|
||||
for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, false) ) {
|
||||
for ( final BaseIndex allele : windowHeader.get(hetRefPosition).getAlleles(MIN_ALT_PVALUE_TO_TRIGGER_VARIANT) ) {
|
||||
final int currentIndex = alleleHeaderMap.size();
|
||||
if ( currentIndex > 1 )
|
||||
throw new IllegalStateException("There are more than 2 alleles present when creating a diploid consensus");
|
||||
|
||||
alleleHeaderMap.put(allele.b, currentIndex);
|
||||
headersPosStrand[currentIndex] = new HeaderElementList();
|
||||
headersNegStrand[currentIndex] = new HeaderElementList();
|
||||
headersPosStrand[currentIndex] = new SingleStrandConsensusData();
|
||||
headersNegStrand[currentIndex] = new SingleStrandConsensusData();
|
||||
}
|
||||
|
||||
// sanity check that we saw 2 alleles
|
||||
if ( alleleHeaderMap.size() != 2 )
|
||||
throw new IllegalStateException("We expected to see 2 alleles when creating a diploid consensus but saw " + alleleHeaderMap.size());
|
||||
|
||||
final ObjectList<GATKSAMRecord> toRemoveFromReadCache = new ObjectArrayList<GATKSAMRecord>();
|
||||
final ObjectList<GATKSAMRecord> toRemoveFromHeader = new ObjectArrayList<GATKSAMRecord>();
|
||||
final ObjectList<GATKSAMRecord> readsToRemoveFromHeader = new ObjectArrayList<GATKSAMRecord>();
|
||||
|
||||
for ( final GATKSAMRecord read : readsInWindow ) {
|
||||
|
||||
// if the read falls after the region, just skip it for now (we'll get to it later)
|
||||
if ( read.getSoftStart() > refStop )
|
||||
// if the read falls after the het position, just skip it for now (we'll get to it later)
|
||||
if ( read.getSoftStart() > globalHetRefPosition )
|
||||
continue;
|
||||
|
||||
// if the read falls before the region, remove it
|
||||
if ( read.getSoftEnd() < refStart ) {
|
||||
toRemoveFromReadCache.add(read);
|
||||
continue;
|
||||
}
|
||||
|
||||
// check whether the read spans the het site
|
||||
if ( read.getSoftStart() <= globalHetRefPosition && read.getSoftEnd() >= globalHetRefPosition ) {
|
||||
|
||||
// make sure it meets the minimum mapping quality requirement (if not, we'll remove it and not use it for the consensuses)
|
||||
if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) {
|
||||
|
||||
// where on the read is the het position?
|
||||
final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL);
|
||||
|
||||
// this is safe because indels are not supported
|
||||
final byte base = read.getReadBases()[readPosOfHet];
|
||||
final byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPosOfHet];
|
||||
|
||||
// make sure that the base passes filters (if not, we'll remove it and not use it for the consensuses)
|
||||
if ( qual >= MIN_BASE_QUAL_TO_COUNT ) {
|
||||
|
||||
// check which allele this read represents
|
||||
final Integer allele = alleleHeaderMap.get(base);
|
||||
|
||||
// ignore the read if it represents a base that's not part of the consensus
|
||||
if ( allele != null ) {
|
||||
// add to the appropriate polyploid header
|
||||
final LinkedList<HeaderElement> header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele];
|
||||
addToHeader(header, read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// remove from the standard header so that we don't double count it
|
||||
toRemoveFromHeader.add(read);
|
||||
}
|
||||
|
||||
// we remove all reads falling inside the variant region from the window
|
||||
toRemoveFromReadCache.add(read);
|
||||
}
|
||||
|
||||
// sanity check that no new "variant region" exists on just a single consensus strand
|
||||
// due to softclips now that we've broken everything out into their component parts
|
||||
for ( final LinkedList<HeaderElement> header : headersPosStrand ) {
|
||||
if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) )
|
||||
return null;
|
||||
}
|
||||
for ( final LinkedList<HeaderElement> header : headersNegStrand ) {
|
||||
if ( hasSignificantSoftclipPosition(header, globalHetRefPosition) )
|
||||
return null;
|
||||
}
|
||||
|
||||
// create the polyploid synthetic reads
|
||||
final ObjectList<GATKSAMRecord> hetReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
for ( final LinkedList<HeaderElement> header : headersPosStrand )
|
||||
finalizeHetConsensus(header, false, hetReads);
|
||||
for ( final LinkedList<HeaderElement> header : headersNegStrand )
|
||||
finalizeHetConsensus(header, true, hetReads);
|
||||
|
||||
// remove all used reads
|
||||
for ( final GATKSAMRecord read : toRemoveFromReadCache )
|
||||
// remove all other reads from the read cache since we're going to use them here
|
||||
readsInWindow.remove(read);
|
||||
for ( final GATKSAMRecord read : toRemoveFromHeader )
|
||||
|
||||
// if the read falls before the het position, we don't need to look at it
|
||||
if ( read.getSoftEnd() < globalHetRefPosition )
|
||||
continue;
|
||||
|
||||
// remove all spanning reads from the consensus header since we're going to incorporate them into a consensus here instead
|
||||
removeFromHeader(windowHeader, read);
|
||||
|
||||
// make sure it meets the minimum mapping quality requirement (if not, we won't use it for the consensus)
|
||||
if ( read.getMappingQuality() >= MIN_MAPPING_QUALITY ) {
|
||||
|
||||
// where on the read is the het position?
|
||||
final int readPosOfHet = ReadUtils.getReadCoordinateForReferenceCoordinate(read, globalHetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL);
|
||||
|
||||
// this is safe because indels are not supported
|
||||
final byte base = read.getReadBases()[readPosOfHet];
|
||||
final byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPosOfHet];
|
||||
|
||||
// check which allele this read represents
|
||||
final Integer allele = alleleHeaderMap.get(base);
|
||||
|
||||
// ignore the read if it represents a base that's not part of the consensus
|
||||
if ( allele != null ) {
|
||||
// add to the appropriate polyploid header
|
||||
final SingleStrandConsensusData header = read.getReadNegativeStrandFlag() ? headersNegStrand[allele] : headersPosStrand[allele];
|
||||
header.reads.add(read);
|
||||
addToHeader(header.consensus, read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// create the polyploid synthetic reads if we can
|
||||
final ObjectList<GATKSAMRecord> hetReads = new ObjectArrayList<GATKSAMRecord>();
|
||||
|
||||
// sanity check that no new "variant region" exists on just a single consensus strand due to softclips
|
||||
// or multi-allelic sites now that we've broken everything out into their component parts. if one does
|
||||
// exist then we need to back out the consensus for that strand only.
|
||||
for ( final SingleStrandConsensusData header : headersPosStrand ) {
|
||||
if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) )
|
||||
hetReads.addAll(header.reads);
|
||||
else
|
||||
finalizeHetConsensus(header.consensus, false, hetReads);
|
||||
}
|
||||
for ( final SingleStrandConsensusData header : headersNegStrand ) {
|
||||
if ( hasPositionWithSignificantSoftclipsOrVariant(header.consensus, globalHetRefPosition) )
|
||||
hetReads.addAll(header.reads);
|
||||
else
|
||||
finalizeHetConsensus(header.consensus, true, hetReads);
|
||||
}
|
||||
|
||||
return hetReads;
|
||||
}
|
||||
|
||||
|
|
@ -1058,7 +1096,7 @@ public class SlidingWindow {
|
|||
int locationIndex = headerStart < 0 ? 0 : readStart - headerStart;
|
||||
|
||||
if ( removeRead && locationIndex < 0 )
|
||||
throw new ReviewedStingException("Provided read is behind the Sliding Window! Read = " + read + ", readStart = " + readStart + ", cigar = " + read.getCigarString() + ", window = " + headerStart + "-" + getStopLocation(header));
|
||||
throw new IllegalStateException("Provided read is behind the Sliding Window! Read = " + read + ", readStart = " + readStart + ", cigar = " + read.getCigarString() + ", window = " + headerStart + "-" + getStopLocation(header));
|
||||
|
||||
// we only need to create new header elements if we are adding the read, not when we're removing it
|
||||
if ( !removeRead )
|
||||
|
|
@ -1138,6 +1176,8 @@ public class SlidingWindow {
|
|||
case H:
|
||||
break;
|
||||
case I:
|
||||
readBaseIndex += cigarElement.getLength();
|
||||
|
||||
// special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it.
|
||||
if ( removeRead && locationIndex == 0 )
|
||||
break;
|
||||
|
|
@ -1150,7 +1190,6 @@ public class SlidingWindow {
|
|||
else
|
||||
headerElement.addInsertionToTheRight();
|
||||
|
||||
readBaseIndex += cigarElement.getLength();
|
||||
break;
|
||||
case D:
|
||||
// deletions are added to the baseCounts with the read mapping quality as it's quality score
|
||||
|
|
|
|||
|
|
@ -55,6 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
|||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
|
@ -99,7 +100,7 @@ public class AssessReducedQuals extends LocusWalker<GenomeLoc, GenomeLoc> implem
|
|||
public int sufficientQualSum = 600;
|
||||
|
||||
@Argument(fullName = "qual_epsilon", shortName = "epsilon", doc = "when |Quals_reduced_bam - Quals_original_bam| > (epsilon * Quals_original_bam) we output this interval", required = false)
|
||||
public double qual_epsilon = 0.25;
|
||||
public double qual_epsilon = 0.10;
|
||||
|
||||
@Output
|
||||
protected PrintStream out;
|
||||
|
|
@ -145,7 +146,7 @@ public class AssessReducedQuals extends LocusWalker<GenomeLoc, GenomeLoc> implem
|
|||
}
|
||||
|
||||
private boolean isGoodRead(final PileupElement p) {
|
||||
return !p.isDeletion() && (int)p.getQual() >= 20 && p.getMappingQual() >= 20;
|
||||
return !p.isDeletion() && (int)p.getQual() >= 15 && p.getMappingQual() >= 20;
|
||||
}
|
||||
|
||||
private int getTagIndex(final List<String> tags) {
|
||||
|
|
|
|||
|
|
@ -179,7 +179,7 @@ public class BaseCountsUnitTest extends BaseTest {
|
|||
BaseCounts counts = new BaseCounts();
|
||||
|
||||
for ( int qual : test.quals )
|
||||
counts.incr(BaseIndex.A, (byte)qual);
|
||||
counts.incr(BaseIndex.A, (byte)qual, 20);
|
||||
|
||||
final int actualSum = (int)counts.getSumQuals((byte)'A');
|
||||
final int expectedSum = qualSum(test.quals);
|
||||
|
|
|
|||
|
|
@ -48,12 +48,12 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
|||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class HeaderElementUnitTest extends BaseTest {
|
||||
|
|
@ -119,16 +119,15 @@ public class HeaderElementUnitTest extends BaseTest {
|
|||
Assert.assertFalse(headerElement.hasFilteredData());
|
||||
Assert.assertFalse(headerElement.hasInsertionToTheRight());
|
||||
Assert.assertTrue(headerElement.isEmpty());
|
||||
Assert.assertEquals(headerElement.getRMS(), 0.0);
|
||||
}
|
||||
|
||||
private void testHeaderData(final HeaderElement headerElement, final HETest test) {
|
||||
Assert.assertEquals(headerElement.getRMS(), (double)test.MQ);
|
||||
Assert.assertEquals(headerElement.isVariantFromSoftClips(), test.isClip);
|
||||
Assert.assertFalse(headerElement.isEmpty());
|
||||
Assert.assertFalse(headerElement.hasInsertionToTheRight());
|
||||
Assert.assertEquals(headerElement.hasConsensusData(), headerElement.basePassesFilters(test.baseQual, minBaseQual, test.MQ, minMappingQual));
|
||||
Assert.assertEquals(headerElement.hasFilteredData(), !headerElement.basePassesFilters(test.baseQual, minBaseQual, test.MQ, minMappingQual));
|
||||
Assert.assertEquals(headerElement.hasConsensusData(), test.MQ >= minMappingQual);
|
||||
Assert.assertEquals(headerElement.hasFilteredData(), test.MQ < minMappingQual);
|
||||
Assert.assertEquals(headerElement.hasConsensusData() ? headerElement.getConsensusBaseCounts().getRMS() : headerElement.getFilteredBaseCounts().getRMS(), (double)test.MQ);
|
||||
Assert.assertFalse(headerElement.isVariantFromMismatches(0.05));
|
||||
Assert.assertEquals(headerElement.isVariant(0.05, 0.05), test.isClip);
|
||||
}
|
||||
|
|
@ -136,13 +135,11 @@ public class HeaderElementUnitTest extends BaseTest {
|
|||
|
||||
private class AllelesTest {
|
||||
public final int[] counts;
|
||||
public final double proportion;
|
||||
public final boolean allowDeletions;
|
||||
public final double pvalue;
|
||||
|
||||
private AllelesTest(final int[] counts, final double proportion, final boolean allowDeletions) {
|
||||
private AllelesTest(final int[] counts, final double pvalue) {
|
||||
this.counts = counts;
|
||||
this.proportion = proportion;
|
||||
this.allowDeletions = allowDeletions;
|
||||
this.pvalue = pvalue;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -151,17 +148,15 @@ public class HeaderElementUnitTest extends BaseTest {
|
|||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final int[] counts = new int[]{ 0, 5, 10, 15, 20 };
|
||||
final double [] proportions = new double[]{ 0.0, 0.05, 0.10, 0.50, 1.0 };
|
||||
final double [] pvalues = new double[]{ 0.0, 0.01, 0.05, 0.20, 1.0 };
|
||||
|
||||
for ( final int countA : counts ) {
|
||||
for ( final int countC : counts ) {
|
||||
for ( final int countG : counts ) {
|
||||
for ( final int countT : counts ) {
|
||||
for ( final int countD : counts ) {
|
||||
for ( final double proportion : proportions ) {
|
||||
for ( final boolean allowDeletions : Arrays.asList(true, false) ) {
|
||||
tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, proportion, allowDeletions)});
|
||||
}
|
||||
for ( final double pvalue : pvalues ) {
|
||||
tests.add(new Object[]{new AllelesTest(new int[]{countA, countC, countG, countT, countD}, pvalue)});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -182,28 +177,33 @@ public class HeaderElementUnitTest extends BaseTest {
|
|||
headerElement.addBase(base.b, byte20, byte10, byte10, byte20, minBaseQual, minMappingQual, false);
|
||||
}
|
||||
|
||||
final int nAllelesSeen = headerElement.getNumberOfAlleles(test.proportion, test.allowDeletions);
|
||||
final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.proportion, test.allowDeletions);
|
||||
final int nAllelesSeen = headerElement.getNumberOfBaseAlleles(test.pvalue);
|
||||
final int nAllelesExpected = calculateExpectedAlleles(test.counts, test.pvalue);
|
||||
|
||||
Assert.assertEquals(nAllelesSeen, nAllelesExpected);
|
||||
}
|
||||
|
||||
private static int calculateExpectedAlleles(final int[] counts, final double proportion, final boolean allowDeletions) {
|
||||
double total = 0.0;
|
||||
private static int calculateExpectedAlleles(final int[] counts, final double targetPvalue) {
|
||||
int total = 0;
|
||||
for ( final int count : counts ) {
|
||||
total += count;
|
||||
}
|
||||
|
||||
final int minCount = Math.max(1, (int)(proportion * total));
|
||||
|
||||
if ( !allowDeletions && counts[BaseIndex.D.index] >= minCount )
|
||||
return -1;
|
||||
|
||||
int result = 0;
|
||||
for ( final int count : counts ) {
|
||||
if ( count > 0 && count >= minCount )
|
||||
for ( int index = 0; index < counts.length; index++ ) {
|
||||
final int count = counts[index];
|
||||
if ( count == 0 )
|
||||
continue;
|
||||
|
||||
final double pvalue = MathUtils.binomialCumulativeProbability(0, count + 1, total, 0.5);
|
||||
|
||||
if ( pvalue > targetPvalue ) {
|
||||
if ( index == BaseIndex.D.index )
|
||||
return -1;
|
||||
result++;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,6 +48,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
|||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -74,10 +75,10 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
final static String emptyFileMd5 = "d41d8cd98f00b204e9800998ecf8427e";
|
||||
|
||||
protected Pair<List<File>, List<String>> executeTest(final String name, final WalkerTestSpec spec) {
|
||||
return executeTest(name, spec, false);
|
||||
return executeTest(name, spec, emptyFileMd5);
|
||||
}
|
||||
|
||||
protected Pair<List<File>, List<String>> executeTest(final String name, final WalkerTestSpec spec, final boolean disableQualsTest) {
|
||||
protected Pair<List<File>, List<String>> executeTest(final String name, final WalkerTestSpec spec, final String qualsTestMD5) {
|
||||
final Pair<List<File>, List<String>> result = super.executeTest(name, spec);
|
||||
|
||||
// perform some Reduce Reads specific testing now
|
||||
|
|
@ -93,15 +94,13 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
reducedInputs.append(file.getAbsolutePath());
|
||||
}
|
||||
|
||||
// run the coverage test
|
||||
final String coverageCommand = createCommandLine("AssessReducedCoverage", originalArgs);
|
||||
super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(coverageCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5)));
|
||||
// the coverage test is a less stricter version of the quals test so we can safely ignore it for now
|
||||
//final String coverageCommand = createCommandLine("AssessReducedCoverage", originalArgs);
|
||||
//super.executeTest(name + " : COVERAGE_TEST", new WalkerTestSpec(coverageCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5)));
|
||||
|
||||
// run the quals test
|
||||
if ( !disableQualsTest ) {
|
||||
final String qualsCommand = createCommandLine("AssessReducedQuals", originalArgs);
|
||||
super.executeTest(name + " : QUALS_TEST", new WalkerTestSpec(qualsCommand + reducedInputs.toString(), Arrays.asList(emptyFileMd5)));
|
||||
}
|
||||
final String qualsCommand = createCommandLine("AssessReducedQuals", originalArgs);
|
||||
super.executeTest(name + " : QUALS_TEST", new WalkerTestSpec(qualsCommand + reducedInputs.toString(), Arrays.asList(qualsTestMD5)));
|
||||
}
|
||||
|
||||
return result;
|
||||
|
|
@ -147,62 +146,69 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
}
|
||||
|
||||
private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns) {
|
||||
this.RRTest(testName, args, md5, useKnowns, false);
|
||||
this.RRTest(testName, args, md5, useKnowns, emptyFileMd5);
|
||||
}
|
||||
|
||||
private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns, final boolean disableQualsTest) {
|
||||
private void RRTest(final String testName, final String args, final String md5, final boolean useKnowns, final String qualsTestMD5) {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s" + (useKnowns ? " -known " + DBSNP : "") + " ";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList("bam"), Arrays.asList(md5));
|
||||
executeTest(testName, spec, disableQualsTest);
|
||||
executeTest(testName, spec, qualsTestMD5);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testDefaultCompression() {
|
||||
RRTest("testDefaultCompression ", L, "538362abd504200800145720b23c98ce", false);
|
||||
RRTest("testDefaultCompression ", L, "62f8cdb85a424e42e9c56f36302d1dba", false);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testDefaultCompressionWithKnowns() {
|
||||
RRTest("testDefaultCompressionWithKnowns ", L, "79cdbd997196957af63f46353cff710b", true);
|
||||
RRTest("testDefaultCompressionWithKnowns ", L, "874c0e0a54c3db67f5e9d7c0d45b7844", true);
|
||||
}
|
||||
|
||||
private final String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultipleIntervals() {
|
||||
RRTest("testMultipleIntervals ", intervals, "6733b25e87e3fce5753cf7936ccf934f", false);
|
||||
RRTest("testMultipleIntervals ", intervals, "2e849f8324b27af36bae8cb9b01722e6", false);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMultipleIntervalsWithKnowns() {
|
||||
RRTest("testMultipleIntervalsWithKnowns ", intervals, "99e2a79befc71eaadb4197c66a0d6df8", true);
|
||||
RRTest("testMultipleIntervalsWithKnowns ", intervals, "71bc2167cc6916288bd34dcf099feebc", true);
|
||||
}
|
||||
|
||||
final String highCompressionMD5 = "c83256fa2d6785d5188f50dd45c77e0f";
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testHighCompression() {
|
||||
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "e3b7e14655973c8950d7fec96321e483", false);
|
||||
RRTest("testHighCompression ", " -cs 10 -min_pvalue 0.3 -mindel 0.3 " + L, highCompressionMD5, false);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testHighCompressionWithKnowns() {
|
||||
RRTest("testHighCompressionWithKnowns ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "30a7ed079b3a41ed63e520260fa6afe3", true);
|
||||
RRTest("testHighCompressionWithKnowns ", " -cs 10 -min_pvalue 0.3 -mindel 0.3 " + L, highCompressionMD5, true);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLowCompression() {
|
||||
// too much downsampling for quals test
|
||||
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", false, true);
|
||||
RRTest("testLowCompression ", " -cs 30 -min_pvalue 0.001 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a903558ef284381d74b0ad837deb19f6", false);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testLowCompressionWithKnowns() {
|
||||
// too much downsampling for quals test
|
||||
RRTest("testLowCompressionWithKnowns ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "e4cedfcf45cb747e58a7e729eec56de2", true, true);
|
||||
RRTest("testLowCompressionWithKnowns ", " -cs 30 -min_pvalue 0.001 -mindel 0.01 -minmap 5 -minqual 5 " + L, "a4c5aa158c6ebbc703134cbe2d48619c", true);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testBadPvalueInput() {
|
||||
final String cmd = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + "-o %s -min_pvalue -0.01";
|
||||
WalkerTestSpec spec = new WalkerTestSpec(cmd, 1, UserException.BadArgumentValue.class);
|
||||
executeTest("testBadPvalueInput", spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testIndelCompression() {
|
||||
final String md5 = "f58ae2154e0e5716be0e850b7605856e";
|
||||
final String md5 = "56154baed62be07008d3684a0a4c0996";
|
||||
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, false);
|
||||
RRTest("testIndelCompressionWithKnowns ", " -cs 50 -L 20:10,100,500-10,100,600 ", md5, true);
|
||||
}
|
||||
|
|
@ -210,28 +216,25 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
@Test(enabled = true)
|
||||
public void testFilteredDeletionCompression() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
|
||||
// don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB)
|
||||
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bfe0693aea74634f1035a9bd11302517")), true);
|
||||
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("d7655de41d90aecb716f79e32d53b2d1")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testCoReduction() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B) + " -o %s ";
|
||||
// don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB)
|
||||
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("930ec2e2c3b62bec7a2425a82c64f022")), true);
|
||||
executeTest("testCoReduction", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fa549ba96ca0ce5fbf3553ba173167e8")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testCoReductionWithKnowns() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s -I %s -known %s", COREDUCTION_L, REF, COREDUCTION_BAM_A, COREDUCTION_BAM_B, DBSNP) + " -o %s ";
|
||||
// don't use quals test here (there's one location with a weird layout that won't pass; signed off by EB)
|
||||
executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("fe7c9fd35e50a828e0f38a7ae25b60a7")), true);
|
||||
executeTest("testCoReductionWithKnowns", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9edcf09b21a4ae8d9fc25222bcb0486b")));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testInsertionsAtEdgeOfConsensus() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, INSERTIONS_AT_EDGE_OF_CONSENSUS_BAM) + " -o %s ";
|
||||
executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4445db7aeddaf2f1d86e1af0cdc74c8")));
|
||||
executeTest("testInsertionsAtEdgeOfConsensus", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("27cc8f1a336b2d0a29855ceb8fc988b0")));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -245,7 +248,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
@Test(enabled = true)
|
||||
public void testAddingReadAfterTailingTheStash() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
|
||||
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("f118e83c394d21d901a24230379864fc")));
|
||||
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("34baf99904b676d5f132d3791030ed0a")), "3eab32c215ba68e75efd5ab7e9f7a2e7");
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -256,7 +259,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
public void testDivideByZero() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
|
||||
// we expect to lose coverage due to the downsampling so don't run the systematic tests
|
||||
executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("bd5198a3e21034887b741faaaa3964bf")));
|
||||
executeTestWithoutAdditionalRRTests("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("985c4f15a1d45267abb2f6790267930d")));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -266,7 +269,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
@Test(enabled = true)
|
||||
public void testReadOffContig() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, OFFCONTIG_BAM) + " -o %s ";
|
||||
executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("b4dc66445ddf5f467f67860bed023ef8")));
|
||||
executeTest("testReadOffContig", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("388ef48791965d637e4bdb45d5d7cf01")));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -276,8 +279,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
public void testPairedReadsInVariantRegion() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", hg19Reference, BOTH_ENDS_OF_PAIR_IN_VARIANT_REGION_BAM) +
|
||||
" -o %s --downsample_coverage 250 -dcov 50 ";
|
||||
// don't use quals test here (there's one location with low quals that won't pass; signed off by EB)
|
||||
executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("9bed260b6245f5ff47db8541405504aa")), true);
|
||||
executeTest("testPairedReadsInVariantRegion", new WalkerTestSpec(base, Arrays.asList("bam"), Arrays.asList("cfa2588f5edf74c5ddf3d190f5ac6f2d")));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -46,9 +46,7 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
|
||||
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
|
||||
import it.unimi.dsi.fastutil.objects.*;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
|
|
@ -58,6 +56,7 @@ import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -198,6 +197,7 @@ public class ReduceReadsUnitTest extends BaseTest {
|
|||
final ReduceReads rr = new ReduceReads();
|
||||
RodBinding.resetNameCounter();
|
||||
rr.known = Arrays.<RodBinding<VariantContext>>asList(new RodBinding(VariantContext.class, "known"));
|
||||
rr.knownSnpPositions = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
final GenomeAnalysisEngine engine = new GenomeAnalysisEngine();
|
||||
engine.setGenomeLocParser(genomeLocParser);
|
||||
|
|
|
|||
|
|
@ -200,17 +200,16 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
@Test(enabled = true)
|
||||
public void testMarkVariantRegion() {
|
||||
final SlidingWindow slidingWindow = new SlidingWindow("1", 0, globalStartPosition);
|
||||
SlidingWindow.MarkedSites markedSites = slidingWindow.new MarkedSites();
|
||||
markedSites.updateRegion(100, 100);
|
||||
slidingWindow.getMarkedSitesForTesting().updateRegion(100, 100);
|
||||
|
||||
slidingWindow.markVariantRegion(markedSites, 40);
|
||||
Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 21);
|
||||
slidingWindow.markVariantRegion(40);
|
||||
Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 21);
|
||||
|
||||
slidingWindow.markVariantRegion(markedSites, 5);
|
||||
Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 37);
|
||||
slidingWindow.markVariantRegion(5);
|
||||
Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 37);
|
||||
|
||||
slidingWindow.markVariantRegion(markedSites, 95);
|
||||
Assert.assertEquals(countTrueBits(markedSites.getVariantSiteBitSet()), 52);
|
||||
slidingWindow.markVariantRegion(95);
|
||||
Assert.assertEquals(countTrueBits(slidingWindow.getMarkedSitesForTesting().getVariantSiteBitSet()), 52);
|
||||
}
|
||||
|
||||
private static int countTrueBits(final boolean[] bitset) {
|
||||
|
|
@ -254,10 +253,12 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
private class ConsensusCreationTest {
|
||||
public final int expectedNumberOfReads, expectedNumberOfReadsWithHetCompression;
|
||||
public final List<GATKSAMRecord> myReads = new ArrayList<GATKSAMRecord>(20);
|
||||
public final String description;
|
||||
|
||||
private ConsensusCreationTest(final List<GenomeLoc> locs, final boolean readsShouldBeLowQuality, final boolean variantBaseShouldBeLowQuality, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression) {
|
||||
this.expectedNumberOfReads = expectedNumberOfReads;
|
||||
this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression;
|
||||
this.description = String.format("%d %d", expectedNumberOfReads, expectedNumberOfReadsWithHetCompression);
|
||||
|
||||
// first, add the basic reads to the collection
|
||||
myReads.addAll(basicReads);
|
||||
|
|
@ -270,6 +271,7 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
private ConsensusCreationTest(final List<GenomeLoc> locs, final CigarOperator operator, final int expectedNumberOfReads, final int expectedNumberOfReadsWithHetCompression) {
|
||||
this.expectedNumberOfReads = expectedNumberOfReads;
|
||||
this.expectedNumberOfReadsWithHetCompression = expectedNumberOfReadsWithHetCompression;
|
||||
this.description = String.format("%s %d %d", operator.toString(), expectedNumberOfReads, expectedNumberOfReadsWithHetCompression);
|
||||
|
||||
// first, add the basic reads to the collection
|
||||
myReads.addAll(basicReads);
|
||||
|
|
@ -279,6 +281,8 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
myReads.add(createVariantRead(loc, false, false, operator));
|
||||
}
|
||||
|
||||
public String toString() { return description; }
|
||||
|
||||
private GATKSAMRecord createVariantRead(final GenomeLoc loc, final boolean readShouldBeLowQuality,
|
||||
final boolean variantBaseShouldBeLowQuality, final CigarOperator operator) {
|
||||
|
||||
|
|
@ -315,7 +319,6 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
private static final GenomeLoc loc295 = new UnvalidatingGenomeLoc("1", 0, 1000295, 1000295);
|
||||
private static final GenomeLoc loc309 = new UnvalidatingGenomeLoc("1", 0, 1000309, 1000309);
|
||||
private static final GenomeLoc loc310 = new UnvalidatingGenomeLoc("1", 0, 1000310, 1000310);
|
||||
private static final GenomeLoc loc312 = new UnvalidatingGenomeLoc("1", 0, 1000312, 1000312);
|
||||
private static final GenomeLoc loc1100 = new UnvalidatingGenomeLoc("1", 0, 1001100, 1001100);
|
||||
|
||||
@DataProvider(name = "ConsensusCreation")
|
||||
|
|
@ -328,7 +331,6 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc295), false, false, 10, 10)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc309), false, false, 10, 10)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc310), false, false, 11, 11)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290, loc312), false, false, 11, 8)});
|
||||
|
||||
// test low quality reads
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(), true, false, 1, 1)});
|
||||
|
|
@ -346,7 +348,7 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
|
||||
// test mixture
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc1100), true, false, 2, 2)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc1100), false, true, 3, 3)});
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc1100), false, true, 1, 1)});
|
||||
|
||||
// test I/D operators
|
||||
tests.add(new Object[]{new ConsensusCreationTest(Arrays.<GenomeLoc>asList(loc290), CigarOperator.D, 9, 9)});
|
||||
|
|
@ -370,17 +372,22 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
for ( final GATKSAMRecord read : test.myReads )
|
||||
slidingWindow.addRead(read);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReads);
|
||||
|
||||
// test WITH het compression
|
||||
// test WITH het compression at KNOWN sites
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : test.myReads )
|
||||
slidingWindow.addRead(read);
|
||||
for ( int i = 0; i < 1200; i++ )
|
||||
knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i));
|
||||
result = slidingWindow.close(knownSNPs);
|
||||
Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression);
|
||||
|
||||
// test WITH het compression at ALL sites
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : test.myReads )
|
||||
slidingWindow.addRead(read);
|
||||
result = slidingWindow.close(null);
|
||||
Assert.assertEquals(result.getFirst().size(), test.expectedNumberOfReadsWithHetCompression);
|
||||
}
|
||||
|
||||
|
|
@ -405,21 +412,26 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
final ObjectAVLTreeSet<GenomeLoc> knownSNPs = new ObjectAVLTreeSet<GenomeLoc>();
|
||||
|
||||
// test WITHOUT het compression
|
||||
SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
SlidingWindow slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
Pair<ObjectSet<GATKSAMRecord>, CompressionStash> result = slidingWindow.close(knownSNPs); // currently empty
|
||||
|
||||
Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all
|
||||
|
||||
// test WITH het compression
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.05, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
// test WITH het compression at KNOWN sites
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
for ( int i = 0; i < readLength; i++ )
|
||||
knownSNPs.add(new UnvalidatingGenomeLoc("1", 0, globalStartPosition + i, globalStartPosition + i));
|
||||
result = slidingWindow.close(knownSNPs);
|
||||
Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all
|
||||
|
||||
// test WITH het compression at ALL sites
|
||||
slidingWindow = new SlidingWindow("1", 0, 10, header, new GATKSAMReadGroupRecord("test"), 0, 0.1, 0.05, 20, 20, 100, ReduceReads.DownsampleStrategy.Normal, false);
|
||||
for ( final GATKSAMRecord read : myReads )
|
||||
slidingWindow.addRead(read);
|
||||
result = slidingWindow.close(knownSNPs);
|
||||
Assert.assertEquals(result.getFirst().size(), totalNumReads); // no compression at all
|
||||
}
|
||||
|
||||
|
|
@ -692,7 +704,7 @@ public class SlidingWindowUnitTest extends BaseTest {
|
|||
slidingWindow.actuallyUpdateHeaderForRead(windowHeader, softclippedRead, false, indexWithSoftclips);
|
||||
}
|
||||
|
||||
final boolean result = slidingWindow.hasSignificantSoftclipPosition(windowHeader, currentHeaderStart + indexToSkip);
|
||||
final boolean result = slidingWindow.hasPositionWithSignificantSoftclipsOrVariant(windowHeader, currentHeaderStart + indexToSkip);
|
||||
Assert.assertEquals(result, indexWithSoftclips != -1 && indexWithSoftclips != indexToSkip);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -329,8 +329,8 @@ public class MathUtils {
|
|||
/**
|
||||
* Performs the cumulative sum of binomial probabilities, where the probability calculation is done in log space.
|
||||
*
|
||||
* @param start - start of the cumulant sum (over hits)
|
||||
* @param end - end of the cumulant sum (over hits)
|
||||
* @param start - start (inclusive) of the cumulant sum (over hits)
|
||||
* @param end - end (exclusive) of the cumulant sum (over hits)
|
||||
* @param total - number of attempts for the number of hits
|
||||
* @param probHit - probability of a successful hit
|
||||
* @return - returns the cumulative probability
|
||||
|
|
|
|||
Loading…
Reference in New Issue