Merge branch 'develop' of github.com:broadinstitute/cmi-gatk into develop
This commit is contained in:
commit
9bb241f06f
|
|
@ -577,6 +577,7 @@
|
|||
docletpathref="doclet.classpath"
|
||||
classpathref="external.dependencies"
|
||||
classpath="${java.classes}"
|
||||
maxmemory="2g"
|
||||
additionalparam="-build-timestamp "${build.timestamp}" -absolute-version ${build.version} -out ${basedir}/${resource.path} -quiet">
|
||||
<sourcefiles>
|
||||
<union>
|
||||
|
|
@ -780,6 +781,7 @@
|
|||
docletpathref="doclet.classpath"
|
||||
classpathref="external.dependencies"
|
||||
classpath="${java.classes}"
|
||||
maxmemory="2g"
|
||||
additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp "${build.timestamp}" -absolute-version ${build.version} -quiet"> <!-- -test to only do DocumentationTest walker -->
|
||||
<sourcefiles>
|
||||
<fileset refid="java.source.files"/>
|
||||
|
|
@ -1177,7 +1179,7 @@
|
|||
|
||||
<!-- copy the report to our private_html directory for easy viewing in a browser -->
|
||||
<mkdir dir="${iwww.report.dir}/@{testtype}"/>
|
||||
<copy todir="${iwww.report.dir}/@{testtype}" verbose="true">
|
||||
<copy todir="${iwww.report.dir}/@{testtype}" verbose="false">
|
||||
<fileset dir="@{outputdir}"/>
|
||||
</copy>
|
||||
|
||||
|
|
|
|||
4
ivy.xml
4
ivy.xml
|
|
@ -78,8 +78,8 @@
|
|||
<dependency org="net.sf.gridscheduler" name="drmaa" rev="latest.integration"/>
|
||||
|
||||
<!-- Scala dependencies -->
|
||||
<dependency org="org.scala-lang" name="scala-compiler" rev="2.8.1"/>
|
||||
<dependency org="org.scala-lang" name="scala-library" rev="2.8.1"/>
|
||||
<dependency org="org.scala-lang" name="scala-compiler" rev="2.9.2"/>
|
||||
<dependency org="org.scala-lang" name="scala-library" rev="2.9.2"/>
|
||||
|
||||
<!-- testing and evaluation dependencies -->
|
||||
<dependency org="org.testng" name="testng" rev="5.14.1" conf="test"/>
|
||||
|
|
|
|||
|
|
@ -34,17 +34,20 @@ import org.broadinstitute.sting.utils.recalibration.EventType;
|
|||
import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalDatum;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource {
|
||||
|
||||
// optimizations: don't reallocate an array each time
|
||||
private byte[] tempQualArray;
|
||||
private boolean[] tempErrorArray;
|
||||
private double[] tempFractionalErrorArray;
|
||||
|
||||
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) {
|
||||
super.initialize(covariates, recalibrationTables);
|
||||
tempQualArray = new byte[EventType.values().length];
|
||||
tempErrorArray = new boolean[EventType.values().length];
|
||||
tempFractionalErrorArray = new double[EventType.values().length];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -56,6 +59,7 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
|
|||
* @param pileupElement The pileup element to update
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
@Override
|
||||
public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) {
|
||||
final int offset = pileupElement.getOffset();
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead());
|
||||
|
|
@ -73,15 +77,15 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
|
|||
final byte qual = tempQualArray[eventIndex];
|
||||
final boolean isError = tempErrorArray[eventIndex];
|
||||
|
||||
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE);
|
||||
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getReadGroupTable();
|
||||
final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex);
|
||||
final RecalDatum rgThisDatum = createDatumObject(qual, isError);
|
||||
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
|
||||
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
|
||||
rgRecalTable.put(rgThisDatum, keys[0], eventIndex);
|
||||
else
|
||||
rgPreviousDatum.combine(rgThisDatum);
|
||||
|
||||
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE);
|
||||
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getQualityScoreTable();
|
||||
final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex);
|
||||
if (qualPreviousDatum == null)
|
||||
qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex);
|
||||
|
|
@ -100,4 +104,53 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) {
|
||||
for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
|
||||
if( !skip[offset] ) {
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(read);
|
||||
|
||||
tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset];
|
||||
tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset];
|
||||
tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset];
|
||||
tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset];
|
||||
tempQualArray[EventType.BASE_DELETION.index] = read.getBaseDeletionQualities()[offset];
|
||||
tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset];
|
||||
|
||||
for (final EventType eventType : EventType.values()) {
|
||||
final int[] keys = readCovariates.getKeySet(offset, eventType);
|
||||
final int eventIndex = eventType.index;
|
||||
final byte qual = tempQualArray[eventIndex];
|
||||
final double isError = tempFractionalErrorArray[eventIndex];
|
||||
|
||||
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getReadGroupTable();
|
||||
final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex);
|
||||
final RecalDatum rgThisDatum = createDatumObject(qual, isError);
|
||||
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
|
||||
rgRecalTable.put(rgThisDatum, keys[0], eventIndex);
|
||||
else
|
||||
rgPreviousDatum.combine(rgThisDatum);
|
||||
|
||||
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getQualityScoreTable();
|
||||
final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex);
|
||||
if (qualPreviousDatum == null)
|
||||
qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex);
|
||||
else
|
||||
qualPreviousDatum.increment(1.0, isError);
|
||||
|
||||
for (int i = 2; i < covariates.length; i++) {
|
||||
if (keys[i] < 0)
|
||||
continue;
|
||||
final NestedIntegerArray<RecalDatum> covRecalTable = recalibrationTables.getTable(i);
|
||||
final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex);
|
||||
if (covPreviousDatum == null)
|
||||
covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex);
|
||||
else
|
||||
covPreviousDatum.increment(1.0, isError);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
}
|
||||
}
|
||||
|
||||
public void incr(byte base, byte baseQual, byte insQual, byte delQual) {
|
||||
public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
super.incr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
|
|
@ -32,7 +32,7 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
}
|
||||
}
|
||||
|
||||
public void decr(byte base, byte baseQual, byte insQual, byte delQual) {
|
||||
public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
|
||||
super.decr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
|
|
@ -41,16 +41,15 @@ public class BaseAndQualsCounts extends BaseCounts {
|
|||
}
|
||||
}
|
||||
|
||||
public byte averageInsertionQualsOfMostCommonBase() {
|
||||
return getGenericAverageQualOfMostCommonBase(sumInsertionQuals);
|
||||
public byte averageInsertionQualsOfBase(final BaseIndex base) {
|
||||
return getGenericAverageQualOfBase(base, sumInsertionQuals);
|
||||
}
|
||||
|
||||
public byte averageDeletionQualsOfMostCommonBase() {
|
||||
return getGenericAverageQualOfMostCommonBase(sumDeletionQuals);
|
||||
public byte averageDeletionQualsOfBase(final BaseIndex base) {
|
||||
return getGenericAverageQualOfBase(base, sumDeletionQuals);
|
||||
}
|
||||
|
||||
private byte getGenericAverageQualOfMostCommonBase(Map<BaseIndex, Long> sumQuals) {
|
||||
BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts());
|
||||
private byte getGenericAverageQualOfBase(final BaseIndex base, final Map<BaseIndex, Long> sumQuals) {
|
||||
return (byte) (sumQuals.get(base) / getCount(base));
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,7 +7,7 @@ import java.util.EnumMap;
|
|||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* An object to keep track of the number of occurences of each base and it's quality.
|
||||
* An object to keep track of the number of occurrences of each base and its quality.
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 4/8/11
|
||||
|
|
@ -41,26 +41,26 @@ import java.util.Map;
|
|||
|
||||
@Requires("other != null")
|
||||
public void add(BaseCounts other) {
|
||||
for (BaseIndex i : BaseIndex.values())
|
||||
for (final BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) + other.counts.get(i));
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void sub(BaseCounts other) {
|
||||
for (BaseIndex i : BaseIndex.values())
|
||||
for (final BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) - other.counts.get(i));
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base, byte qual) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
sumQuals.put(i, sumQuals.get(i) + qual);
|
||||
|
|
@ -69,69 +69,63 @@ import java.util.Map;
|
|||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base, byte qual) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
final BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
sumQuals.put(i, sumQuals.get(i) - qual);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(byte base) {
|
||||
public int getCount(final byte base) {
|
||||
return getCount(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(BaseIndex base) {
|
||||
public int getCount(final BaseIndex base) {
|
||||
return counts.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(byte base) {
|
||||
public long getSumQuals(final byte base) {
|
||||
return getSumQuals(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(BaseIndex base) {
|
||||
public long getSumQuals(final BaseIndex base) {
|
||||
return sumQuals.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(byte base) {
|
||||
public byte averageQuals(final byte base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(BaseIndex base) {
|
||||
public byte averageQuals(final BaseIndex base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
}
|
||||
|
||||
public byte baseWithMostCounts() {
|
||||
return baseIndexWithMostCounts().getByte();
|
||||
@Ensures("result >= 0")
|
||||
public int countOfBase(final BaseIndex base) {
|
||||
return counts.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int countOfMostCommonBase() {
|
||||
return counts.get(baseIndexWithMostCounts());
|
||||
public long sumQualsOfBase(final BaseIndex base) {
|
||||
return sumQuals.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long sumQualsOfMostCommonBase() {
|
||||
return sumQuals.get(baseIndexWithMostCounts());
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQualsOfMostCommonBase() {
|
||||
return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase());
|
||||
public byte averageQualsOfBase(final BaseIndex base) {
|
||||
return (byte) (sumQualsOfBase(base) / countOfBase(base));
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -151,7 +145,7 @@ import java.util.Map;
|
|||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(byte base) {
|
||||
public double baseCountProportion(final byte base) {
|
||||
return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount();
|
||||
}
|
||||
|
||||
|
|
@ -162,7 +156,7 @@ import java.util.Map;
|
|||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(BaseIndex baseIndex) {
|
||||
public double baseCountProportion(final BaseIndex baseIndex) {
|
||||
int total = totalCount();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
|
|
@ -179,30 +173,66 @@ import java.util.Map;
|
|||
return b.toString();
|
||||
}
|
||||
|
||||
public byte baseWithMostCounts() {
|
||||
return baseIndexWithMostCounts().getByte();
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCounts() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (BaseIndex i : counts.keySet())
|
||||
if (counts.get(i) > counts.get(maxI))
|
||||
maxI = i;
|
||||
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
|
||||
if (entry.getValue() > counts.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
|
||||
BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (BaseIndex index : counts.keySet())
|
||||
if (index.isNucleotide() && counts.get(index) > counts.get(mostCounts))
|
||||
mostCounts = index;
|
||||
return mostCounts;
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet()) {
|
||||
if (entry.getKey().isNucleotide() && entry.getValue() > counts.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return maxI;
|
||||
}
|
||||
|
||||
private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) {
|
||||
final int targetCount = counts.get(targetIndex);
|
||||
final int testCount = counts.get(testIndex);
|
||||
return ( targetCount > testCount || (targetCount == testCount && sumQuals.get(targetIndex) > sumQuals.get(testIndex)) );
|
||||
}
|
||||
|
||||
public byte baseWithMostProbability() {
|
||||
return baseIndexWithMostProbability().getByte();
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbability() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
|
||||
if (entry.getValue() > sumQuals.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCounts());
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostProbabilityWithoutIndels() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (Map.Entry<BaseIndex, Long> entry : sumQuals.entrySet()) {
|
||||
if (entry.getKey().isNucleotide() && entry.getValue() > sumQuals.get(maxI))
|
||||
maxI = entry.getKey();
|
||||
}
|
||||
return (sumQuals.get(maxI) > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
|
||||
}
|
||||
|
||||
@Ensures("result >=0")
|
||||
public int totalCountWithoutIndels() {
|
||||
int sum = 0;
|
||||
for (BaseIndex index : counts.keySet())
|
||||
if (index.isNucleotide())
|
||||
sum += counts.get(index);
|
||||
for (Map.Entry<BaseIndex, Integer> entry : counts.entrySet())
|
||||
if (entry.getKey().isNucleotide())
|
||||
sum += entry.getValue();
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
|
@ -214,10 +244,14 @@ import java.util.Map;
|
|||
*/
|
||||
@Requires("index.isNucleotide()")
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportionWithoutIndels(BaseIndex index) {
|
||||
int total = totalCountWithoutIndels();
|
||||
public double baseCountProportionWithoutIndels(final BaseIndex index) {
|
||||
final int total = totalCountWithoutIndels();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
return (double) counts.get(index) / totalCountWithoutIndels();
|
||||
}
|
||||
|
||||
public Object[] countsArray() {
|
||||
return counts.values().toArray();
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
|||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
|
|
@ -181,7 +182,7 @@ public class HeaderElement {
|
|||
* @return whether or not the HeaderElement is variant due to excess mismatches
|
||||
*/
|
||||
private boolean isVariantFromMismatches(double minVariantProportion) {
|
||||
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels();
|
||||
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
|
||||
double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
|
||||
return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
|
||||
}
|
||||
|
|
@ -200,5 +201,28 @@ public class HeaderElement {
|
|||
return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the number of haplotypes necessary to represent this site.
|
||||
*
|
||||
* @param minVariantProportion the minimum proportion to call a site variant.
|
||||
* @return the number of haplotypes necessary to represent this site.
|
||||
*/
|
||||
public int getNumberOfHaplotypes(double minVariantProportion) {
|
||||
int nHaplotypes = 0;
|
||||
int totalCount = consensusBaseCounts.totalCount();
|
||||
int runningCount = 0;
|
||||
|
||||
if (totalCount == 0)
|
||||
return 0;
|
||||
|
||||
Object[] countsArray = consensusBaseCounts.countsArray();
|
||||
Arrays.sort(countsArray);
|
||||
for (int i = countsArray.length-1; i>=0; i--) {
|
||||
nHaplotypes++;
|
||||
runningCount += (Integer) countsArray[i];
|
||||
if (runningCount/totalCount > minVariantProportion)
|
||||
break;
|
||||
}
|
||||
return nHaplotypes;
|
||||
}
|
||||
}
|
||||
|
|
@ -53,11 +53,13 @@ public class MultiSampleCompressor implements Compressor {
|
|||
final double minAltProportionToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy) {
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
||||
final int nContigs,
|
||||
final boolean allowPolyploidReduction) {
|
||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||
compressorsPerSample.put(name,
|
||||
new SingleSampleCompressor(name, contextSize, downsampleCoverage,
|
||||
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
new SingleSampleCompressor(contextSize, downsampleCoverage,
|
||||
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -34,7 +34,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
|||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionType;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||
|
|
@ -52,23 +52,23 @@ import java.util.*;
|
|||
|
||||
/**
|
||||
* Reduces the BAM file using read based compression that keeps only essential information for variant calling
|
||||
* <p/>
|
||||
*
|
||||
* <p>
|
||||
* This walker will generate reduced versions of the BAM files that still follow the BAM spec
|
||||
* and contain all the information necessary for the GSA variant calling pipeline. Some options
|
||||
* allow you to tune in how much compression you want to achieve. The default values have been
|
||||
* shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the
|
||||
* savings in file size and performance of the downstream tools.
|
||||
* <p/>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The BAM file to be compressed
|
||||
* </p>
|
||||
* <p/>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* The compressed (reduced) BAM file.
|
||||
* </p>
|
||||
*
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
|
|
@ -86,13 +86,13 @@ import java.util.*;
|
|||
public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {
|
||||
|
||||
@Output
|
||||
protected StingSAMFileWriter out;
|
||||
private StingSAMFileWriter out;
|
||||
|
||||
/**
|
||||
* The number of bases to keep around mismatches (potential variation)
|
||||
*/
|
||||
@Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
|
||||
protected int contextSize = 10;
|
||||
private int contextSize = 10;
|
||||
|
||||
/**
|
||||
* The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
|
||||
|
|
@ -100,7 +100,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* towards variable regions.
|
||||
*/
|
||||
@Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
|
||||
protected int minMappingQuality = 20;
|
||||
private int minMappingQuality = 20;
|
||||
|
||||
/**
|
||||
* The minimum base quality to be considered for the consensus synthetic read. Reads that have
|
||||
|
|
@ -108,35 +108,41 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* towards variable regions.
|
||||
*/
|
||||
@Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
|
||||
protected byte minBaseQual = 20;
|
||||
private byte minBaseQual = 20;
|
||||
|
||||
/**
|
||||
* Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
|
||||
* lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
|
||||
*/
|
||||
@Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
|
||||
protected byte minTailQuality = 2;
|
||||
private byte minTailQuality = 2;
|
||||
|
||||
/**
|
||||
* Allow the experimental polyploid-based reduction capabilities of this tool
|
||||
*/
|
||||
@Argument(fullName = "allow_polyploid_reduction", shortName = "polyploid", doc = "", required = false)
|
||||
private boolean USE_POLYPLOID_REDUCTION = false;
|
||||
|
||||
/**
|
||||
* Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
|
||||
* and read group).
|
||||
*/
|
||||
@Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
|
||||
protected boolean DONT_SIMPLIFY_READS = false;
|
||||
private boolean DONT_SIMPLIFY_READS = false;
|
||||
|
||||
/**
|
||||
* Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
|
||||
* The program will behave correctly in those cases.
|
||||
*/
|
||||
@Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
|
||||
protected boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
|
||||
private boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
|
||||
|
||||
/**
|
||||
* Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
|
||||
* quality.
|
||||
*/
|
||||
@Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
|
||||
protected boolean DONT_CLIP_LOW_QUAL_TAILS = false;
|
||||
private boolean DONT_CLIP_LOW_QUAL_TAILS = false;
|
||||
|
||||
/**
|
||||
* Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
|
||||
|
|
@ -144,7 +150,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
|
||||
*/
|
||||
@Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
|
||||
protected boolean DONT_USE_SOFTCLIPPED_BASES = false;
|
||||
private boolean DONT_USE_SOFTCLIPPED_BASES = false;
|
||||
|
||||
/**
|
||||
* Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee
|
||||
|
|
@ -152,47 +158,55 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
|
||||
*/
|
||||
@Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
|
||||
protected boolean DONT_COMPRESS_READ_NAMES = false;
|
||||
private boolean DONT_COMPRESS_READ_NAMES = false;
|
||||
|
||||
/**
|
||||
* Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval
|
||||
* border.
|
||||
*/
|
||||
@Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false)
|
||||
protected boolean HARD_CLIP_TO_INTERVAL = false;
|
||||
private boolean HARD_CLIP_TO_INTERVAL = false;
|
||||
|
||||
/**
|
||||
* Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
|
||||
* considered consensus.
|
||||
*/
|
||||
@Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
|
||||
protected double minAltProportionToTriggerVariant = 0.05;
|
||||
private double minAltProportionToTriggerVariant = 0.05;
|
||||
|
||||
/**
|
||||
* Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
|
||||
* considered consensus.
|
||||
*/
|
||||
@Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
|
||||
protected double minIndelProportionToTriggerVariant = 0.05;
|
||||
private double minIndelProportionToTriggerVariant = 0.05;
|
||||
|
||||
/**
|
||||
* Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this).
|
||||
* A value of 0 turns downsampling off.
|
||||
*/
|
||||
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
|
||||
protected int downsampleCoverage = 250;
|
||||
private int downsampleCoverage = 250;
|
||||
|
||||
/**
|
||||
* Number of chromosomes in the sample (this is used for the polyploid consensus compression). Only
|
||||
* tested for humans (or organisms with n=2). Use at your own risk!
|
||||
*/
|
||||
@Hidden
|
||||
@Argument(fullName = "contigs", shortName = "ctg", doc = "", required = false)
|
||||
private int nContigs = 2;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "", shortName = "dl", doc = "", required = false)
|
||||
protected int debugLevel = 0;
|
||||
private int debugLevel = 0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "", shortName = "dr", doc = "", required = false)
|
||||
protected String debugRead = "";
|
||||
private String debugRead = "";
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
|
||||
protected DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
|
||||
private DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
|
||||
|
|
@ -203,7 +217,6 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
Adaptive
|
||||
}
|
||||
|
||||
protected int totalReads = 0;
|
||||
int nCompressedReads = 0;
|
||||
|
||||
HashMap<String, Long> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
|
||||
|
|
@ -247,16 +260,15 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
* @return a linked list with all the reads produced by the clipping operations
|
||||
*/
|
||||
@Override
|
||||
public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
|
||||
LinkedList<GATKSAMRecord> mappedReads;
|
||||
totalReads++;
|
||||
if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))
|
||||
System.out.println("Found debug read!");
|
||||
|
||||
if (debugLevel == 1)
|
||||
System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd());
|
||||
|
||||
// we write the actual alignment starts to their respectiv alignment shift tags in the temporary
|
||||
// we write the actual alignment starts to their respective alignment shift tags in the temporary
|
||||
// attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file
|
||||
read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart());
|
||||
read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd());
|
||||
|
|
@ -316,7 +328,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
*/
|
||||
@Override
|
||||
public ReduceReadsStash reduceInit() {
|
||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -532,8 +544,6 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
|
|||
read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (start)
|
||||
if (endShift > 0)
|
||||
read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (end)
|
||||
|
||||
totalReads++;
|
||||
}
|
||||
|
||||
if (debugLevel == 1)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,5 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
|
|
@ -8,35 +7,33 @@ import java.util.TreeSet;
|
|||
|
||||
/**
|
||||
*
|
||||
* @author depristo
|
||||
* @version 0.1
|
||||
* @author carneiro, depristo
|
||||
* @version 3.0
|
||||
*/
|
||||
public class SingleSampleCompressor implements Compressor {
|
||||
protected static final Logger logger = Logger.getLogger(SingleSampleCompressor.class);
|
||||
final private int contextSize;
|
||||
final private int downsampleCoverage;
|
||||
final private int minMappingQuality;
|
||||
final private double minAltProportionToTriggerVariant;
|
||||
final private double minIndelProportionToTriggerVariant;
|
||||
final private int minBaseQual;
|
||||
final private ReduceReads.DownsampleStrategy downsampleStrategy;
|
||||
final private int nContigs;
|
||||
final private boolean allowPolyploidReduction;
|
||||
|
||||
protected final int contextSize;
|
||||
protected final int downsampleCoverage;
|
||||
protected int minMappingQuality;
|
||||
protected int slidingWindowCounter;
|
||||
private SlidingWindow slidingWindow;
|
||||
private int slidingWindowCounter;
|
||||
|
||||
protected final String sampleName;
|
||||
|
||||
protected SlidingWindow slidingWindow;
|
||||
protected double minAltProportionToTriggerVariant;
|
||||
protected double minIndelProportionToTriggerVariant;
|
||||
protected int minBaseQual;
|
||||
|
||||
protected ReduceReads.DownsampleStrategy downsampleStrategy;
|
||||
|
||||
public SingleSampleCompressor(final String sampleName,
|
||||
final int contextSize,
|
||||
public SingleSampleCompressor(final int contextSize,
|
||||
final int downsampleCoverage,
|
||||
final int minMappingQuality,
|
||||
final double minAltProportionToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy) {
|
||||
this.sampleName = sampleName;
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy,
|
||||
final int nContigs,
|
||||
final boolean allowPolyploidReduction) {
|
||||
this.contextSize = contextSize;
|
||||
this.downsampleCoverage = downsampleCoverage;
|
||||
this.minMappingQuality = minMappingQuality;
|
||||
|
|
@ -45,6 +42,8 @@ public class SingleSampleCompressor implements Compressor {
|
|||
this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
|
||||
this.minBaseQual = minBaseQual;
|
||||
this.downsampleStrategy = downsampleStrategy;
|
||||
this.nContigs = nContigs;
|
||||
this.allowPolyploidReduction = allowPolyploidReduction;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -66,7 +65,7 @@ public class SingleSampleCompressor implements Compressor {
|
|||
}
|
||||
|
||||
if ( slidingWindow == null) { // this is the first read
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
|
||||
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction);
|
||||
slidingWindowCounter++;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -8,14 +8,12 @@ import net.sf.samtools.SAMFileHeader;
|
|||
import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -29,10 +27,9 @@ public class SlidingWindow {
|
|||
final private LinkedList<GATKSAMRecord> readsInWindow;
|
||||
final private LinkedList<HeaderElement> windowHeader;
|
||||
protected int contextSize; // the largest context size (between mismatches and indels)
|
||||
protected int stopLocation;
|
||||
protected String contig;
|
||||
protected int contigIndex;
|
||||
protected SAMFileHeader header;
|
||||
protected SAMFileHeader samHeader;
|
||||
protected GATKSAMReadGroupRecord readGroupAttribute;
|
||||
protected int downsampleCoverage;
|
||||
|
||||
|
|
@ -56,6 +53,10 @@ public class SlidingWindow {
|
|||
protected ReduceReads.DownsampleStrategy downsampleStrategy;
|
||||
private boolean hasIndelQualities;
|
||||
|
||||
private final int nContigs;
|
||||
|
||||
private boolean allowPolyploidReductionInGeneral;
|
||||
|
||||
/**
|
||||
* The types of synthetic reads to use in the finalizeAndAdd method
|
||||
*/
|
||||
|
|
@ -66,7 +67,11 @@ public class SlidingWindow {
|
|||
}
|
||||
|
||||
public int getStopLocation() {
|
||||
return stopLocation;
|
||||
return getStopLocation(windowHeader);
|
||||
}
|
||||
|
||||
private int getStopLocation(LinkedList<HeaderElement> header) {
|
||||
return getStartLocation(header) + header.size() - 1;
|
||||
}
|
||||
|
||||
public String getContig() {
|
||||
|
|
@ -77,13 +82,12 @@ public class SlidingWindow {
|
|||
return contigIndex;
|
||||
}
|
||||
|
||||
public int getStartLocation() {
|
||||
return windowHeader.isEmpty() ? -1 : windowHeader.peek().getLocation();
|
||||
public int getStartLocation(LinkedList<HeaderElement> header) {
|
||||
return header.isEmpty() ? -1 : header.peek().getLocation();
|
||||
}
|
||||
|
||||
|
||||
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader header, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) {
|
||||
this.stopLocation = -1;
|
||||
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) {
|
||||
this.contextSize = contextSize;
|
||||
this.downsampleCoverage = downsampleCoverage;
|
||||
|
||||
|
|
@ -97,7 +101,7 @@ public class SlidingWindow {
|
|||
|
||||
this.contig = contig;
|
||||
this.contigIndex = contigIndex;
|
||||
this.header = header;
|
||||
this.samHeader = samHeader;
|
||||
this.readGroupAttribute = readGroupAttribute;
|
||||
|
||||
this.consensusCounter = 0;
|
||||
|
|
@ -111,6 +115,9 @@ public class SlidingWindow {
|
|||
|
||||
this.downsampleStrategy = downsampleStrategy;
|
||||
this.hasIndelQualities = hasIndelQualities;
|
||||
this.nContigs = nContigs;
|
||||
|
||||
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -125,7 +132,7 @@ public class SlidingWindow {
|
|||
* @return a list of reads that have been finished by sliding the window.
|
||||
*/
|
||||
public List<GATKSAMRecord> addRead(GATKSAMRecord read) {
|
||||
updateHeaderCounts(read, false); // update the window header counts
|
||||
addToHeader(windowHeader, read); // update the window header counts
|
||||
readsInWindow.add(read); // add read to sliding reads
|
||||
return slideWindow(read.getUnclippedStart());
|
||||
}
|
||||
|
|
@ -191,17 +198,18 @@ public class SlidingWindow {
|
|||
protected List<GATKSAMRecord> slideWindow(int incomingReadUnclippedStart) {
|
||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
if (incomingReadUnclippedStart - contextSize > getStartLocation()) {
|
||||
int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation();
|
||||
boolean[] variantSite = markSites(getStartLocation() + readStartHeaderIndex);
|
||||
if (incomingReadUnclippedStart - contextSize > getStartLocation(windowHeader)) {
|
||||
int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation(windowHeader);
|
||||
boolean[] variantSite = markSites(getStartLocation(windowHeader) + readStartHeaderIndex);
|
||||
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
|
||||
|
||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, variantSite);
|
||||
finalizedReads = closeVariantRegions(regions, false);
|
||||
|
||||
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
|
||||
if (read.getAlignmentEnd() < getStartLocation()) {
|
||||
final int windowHeaderStartLoc = getStartLocation(windowHeader);
|
||||
for (final GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
|
||||
if (read.getSoftEnd() < windowHeaderStartLoc) {
|
||||
readsToRemove.add(read);
|
||||
}
|
||||
}
|
||||
|
|
@ -222,15 +230,15 @@ public class SlidingWindow {
|
|||
*/
|
||||
protected boolean[] markSites(int stop) {
|
||||
|
||||
boolean[] markedSites = new boolean[stop - getStartLocation() + contextSize + 1];
|
||||
boolean[] markedSites = new boolean[stop - getStartLocation(windowHeader) + contextSize + 1];
|
||||
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.iterator();
|
||||
for (int i = getStartLocation(); i < stop; i++) {
|
||||
for (int i = getStartLocation(windowHeader); i < stop; i++) {
|
||||
if (headerElementIterator.hasNext()) {
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
|
||||
if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
|
||||
markVariantRegion(markedSites, i - getStartLocation());
|
||||
markVariantRegion(markedSites, i - getStartLocation(windowHeader));
|
||||
|
||||
} else
|
||||
break;
|
||||
|
|
@ -260,46 +268,45 @@ public class SlidingWindow {
|
|||
* @param end the first header index NOT TO add to consensus
|
||||
* @return a list of consensus reads generated by this call. Empty list if no consensus was generated.
|
||||
*/
|
||||
protected List<GATKSAMRecord> addToSyntheticReads(int start, int end) {
|
||||
protected List<GATKSAMRecord> addToSyntheticReads(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
|
||||
LinkedList<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
|
||||
if (start < end) {
|
||||
|
||||
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
ListIterator<HeaderElement> headerElementIterator = header.listIterator(start);
|
||||
|
||||
if (!headerElementIterator.hasNext())
|
||||
throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, windowHeader.size(), end));
|
||||
throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, header.size(), end));
|
||||
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
|
||||
if (headerElement.hasConsensusData()) {
|
||||
reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));
|
||||
|
||||
int endOfConsensus = findNextNonConsensusElement(start, end);
|
||||
addToRunningConsensus(start, endOfConsensus);
|
||||
int endOfConsensus = findNextNonConsensusElement(header, start, end);
|
||||
addToRunningConsensus(header, start, endOfConsensus, isNegativeStrand);
|
||||
|
||||
if (endOfConsensus <= start)
|
||||
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start));
|
||||
|
||||
reads.addAll(addToSyntheticReads(endOfConsensus, end));
|
||||
reads.addAll(addToSyntheticReads(header, endOfConsensus, end, isNegativeStrand));
|
||||
} else if (headerElement.hasFilteredData()) {
|
||||
reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
|
||||
|
||||
int endOfFilteredData = findNextNonFilteredDataElement(start, end);
|
||||
addToFilteredData(start, endOfFilteredData);
|
||||
int endOfFilteredData = findNextNonFilteredDataElement(header, start, end);
|
||||
reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand));
|
||||
|
||||
if (endOfFilteredData <= start)
|
||||
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
|
||||
|
||||
reads.addAll(addToSyntheticReads(endOfFilteredData, end));
|
||||
reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, isNegativeStrand));
|
||||
} else if (headerElement.isEmpty()) {
|
||||
reads.addAll(finalizeAndAdd(ConsensusType.BOTH));
|
||||
|
||||
int endOfEmptyData = findNextNonEmptyElement(start, end);
|
||||
int endOfEmptyData = findNextNonEmptyElement(header, start, end);
|
||||
|
||||
if (endOfEmptyData <= start)
|
||||
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));
|
||||
|
||||
reads.addAll(addToSyntheticReads(endOfEmptyData, end));
|
||||
reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, isNegativeStrand));
|
||||
} else
|
||||
throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start));
|
||||
|
||||
|
|
@ -343,8 +350,8 @@ public class SlidingWindow {
|
|||
* @param upTo limit to search for another consensus element
|
||||
* @return next position with consensus data or empty
|
||||
*/
|
||||
private int findNextNonConsensusElement(int start, int upTo) {
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
private int findNextNonConsensusElement(LinkedList<HeaderElement> header, int start, int upTo) {
|
||||
Iterator<HeaderElement> headerElementIterator = header.listIterator(start);
|
||||
int index = start;
|
||||
while (index < upTo) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
|
|
@ -365,8 +372,8 @@ public class SlidingWindow {
|
|||
* @param upTo limit to search for
|
||||
* @return next position with no filtered data
|
||||
*/
|
||||
private int findNextNonFilteredDataElement(int start, int upTo) {
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
private int findNextNonFilteredDataElement(LinkedList<HeaderElement> header, int start, int upTo) {
|
||||
Iterator<HeaderElement> headerElementIterator = header.listIterator(start);
|
||||
int index = start;
|
||||
while (index < upTo) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
|
|
@ -387,8 +394,8 @@ public class SlidingWindow {
|
|||
* @param upTo limit to search for
|
||||
* @return next position with non-empty element
|
||||
*/
|
||||
private int findNextNonEmptyElement(int start, int upTo) {
|
||||
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
private int findNextNonEmptyElement(LinkedList<HeaderElement> header, int start, int upTo) {
|
||||
ListIterator<HeaderElement> headerElementIterator = header.listIterator(start);
|
||||
int index = start;
|
||||
while (index < upTo) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
|
|
@ -412,11 +419,13 @@ public class SlidingWindow {
|
|||
* @param start the first header index to add to consensus
|
||||
* @param end the first header index NOT TO add to consensus
|
||||
*/
|
||||
private void addToFilteredData(int start, int end) {
|
||||
if (filteredDataConsensus == null)
|
||||
filteredDataConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);
|
||||
private List<GATKSAMRecord> addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
|
||||
List<GATKSAMRecord> result = new ArrayList<GATKSAMRecord>(0);
|
||||
|
||||
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
if (filteredDataConsensus == null)
|
||||
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
|
||||
|
||||
ListIterator<HeaderElement> headerElementIterator = header.listIterator(start);
|
||||
for (int index = start; index < end; index++) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist");
|
||||
|
|
@ -428,8 +437,15 @@ public class SlidingWindow {
|
|||
if (!headerElement.hasFilteredData())
|
||||
throw new ReviewedStingException("No filtered data in " + index);
|
||||
|
||||
if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) {
|
||||
result.add(finalizeFilteredDataConsensus());
|
||||
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
|
||||
}
|
||||
|
||||
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -441,11 +457,11 @@ public class SlidingWindow {
|
|||
* @param start the first header index to add to consensus
|
||||
* @param end the first header index NOT TO add to consensus
|
||||
*/
|
||||
private void addToRunningConsensus(int start, int end) {
|
||||
private void addToRunningConsensus(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
|
||||
if (runningConsensus == null)
|
||||
runningConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);
|
||||
runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
|
||||
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
Iterator<HeaderElement> headerElementIterator = header.listIterator(start);
|
||||
for (int index = start; index < end; index++) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist");
|
||||
|
|
@ -466,14 +482,76 @@ public class SlidingWindow {
|
|||
* @param rms the rms mapping quality in the header element
|
||||
*/
|
||||
private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
|
||||
BaseIndex base = baseCounts.baseIndexWithMostCounts();
|
||||
byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE);
|
||||
byte qual = baseCounts.averageQualsOfMostCommonBase();
|
||||
byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase();
|
||||
byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase();
|
||||
final BaseIndex base = baseCounts.baseIndexWithMostProbability();
|
||||
byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE);
|
||||
byte qual = baseCounts.averageQualsOfBase(base);
|
||||
byte insQual = baseCounts.averageInsertionQualsOfBase(base);
|
||||
byte delQual = baseCounts.averageDeletionQualsOfBase(base);
|
||||
syntheticRead.add(base, count, qual, insQual, delQual, rms);
|
||||
}
|
||||
|
||||
private List<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
|
||||
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
// Try to compress into a polyploid consensus
|
||||
int nHaplotypes = 0;
|
||||
int hetRefPosition = -1;
|
||||
boolean canCompress = true;
|
||||
boolean foundEvent = false;
|
||||
Object[] header = windowHeader.toArray();
|
||||
|
||||
// foundEvent will remain false if we don't allow polyploid reduction
|
||||
if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) {
|
||||
for (int i = start; i<=stop; i++) {
|
||||
nHaplotypes = ((HeaderElement) header[i]).getNumberOfHaplotypes(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
|
||||
if (nHaplotypes > nContigs) {
|
||||
canCompress = false;
|
||||
break;
|
||||
}
|
||||
|
||||
// guarantees that there is only 1 site in the variant region that needs more than one haplotype
|
||||
if (nHaplotypes > 1) {
|
||||
if (!foundEvent) {
|
||||
foundEvent = true;
|
||||
hetRefPosition = i;
|
||||
}
|
||||
else {
|
||||
canCompress = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Try to compress the variant region
|
||||
// the "foundEvent" protects us from trying to compress variant regions that are created by insertions
|
||||
if (canCompress && foundEvent) {
|
||||
allReads = createPolyploidConsensus(start, stop, nHaplotypes, ((HeaderElement) header[hetRefPosition]).getLocation());
|
||||
}
|
||||
|
||||
// Return all reads that overlap the variant region and remove them from the window header entirely
|
||||
// also remove all reads preceding the variant region (since they will be output as consensus right after compression
|
||||
else {
|
||||
final int refStart = windowHeader.get(start).getLocation();
|
||||
final int refStop = windowHeader.get(stop).getLocation();
|
||||
|
||||
LinkedList<GATKSAMRecord> toRemove = new LinkedList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord read : readsInWindow) {
|
||||
if (read.getSoftStart() <= refStop) {
|
||||
if (read.getAlignmentEnd() >= refStart) {
|
||||
allReads.add(read);
|
||||
removeFromHeader(windowHeader, read);
|
||||
}
|
||||
toRemove.add(read);
|
||||
}
|
||||
}
|
||||
for (GATKSAMRecord read : toRemove) {
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
}
|
||||
return allReads;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finalizes a variant region, any adjacent synthetic reads.
|
||||
*
|
||||
|
|
@ -482,27 +560,13 @@ public class SlidingWindow {
|
|||
* @return all reads contained in the variant region plus any adjacent synthetic reads
|
||||
*/
|
||||
@Requires("start <= stop")
|
||||
protected List<GATKSAMRecord> closeVariantRegion(int start, int stop) {
|
||||
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
int refStart = windowHeader.get(start).getLocation(); // All operations are reference based, not read based
|
||||
int refStop = windowHeader.get(stop).getLocation();
|
||||
|
||||
for (GATKSAMRecord read : readsInWindow) { // Keep all reads that overlap the variant region
|
||||
if (read.getSoftStart() <= refStop && read.getAlignmentEnd() >= refStart) {
|
||||
allReads.add(read);
|
||||
updateHeaderCounts(read, true); // Remove this read from the window header entirely
|
||||
}
|
||||
}
|
||||
protected List<GATKSAMRecord> closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
|
||||
List<GATKSAMRecord> allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition);
|
||||
|
||||
List<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
|
||||
result.addAll(addToSyntheticReads(0, start));
|
||||
result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
|
||||
result.addAll(finalizeAndAdd(ConsensusType.BOTH));
|
||||
|
||||
for (GATKSAMRecord read : allReads) {
|
||||
readsInWindow.remove(read); // todo -- not optimal, but needs to be done so the next region doesn't try to remove the same reads from the header counts.
|
||||
}
|
||||
|
||||
return result; // finalized reads will be downsampled if necessary
|
||||
}
|
||||
|
||||
|
|
@ -517,7 +581,7 @@ public class SlidingWindow {
|
|||
if (stop < 0 && forceClose)
|
||||
stop = windowHeader.size() - 1;
|
||||
if (stop >= 0) {
|
||||
allReads.addAll(closeVariantRegion(start, stop));
|
||||
allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
|
||||
lastStop = stop;
|
||||
}
|
||||
}
|
||||
|
|
@ -545,7 +609,7 @@ public class SlidingWindow {
|
|||
|
||||
ReservoirDownsampler <GATKSAMRecord> downsampler = new ReservoirDownsampler<GATKSAMRecord>(downsampleCoverage);
|
||||
downsampler.submit(allReads);
|
||||
return downsampler.consumeDownsampledItems();
|
||||
return downsampler.consumeFinalizedItems();
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -561,12 +625,12 @@ public class SlidingWindow {
|
|||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
boolean[] variantSite = markSites(stopLocation + 1);
|
||||
boolean[] variantSite = markSites(getStopLocation(windowHeader) + 1);
|
||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), variantSite);
|
||||
finalizedReads = closeVariantRegions(regions, true);
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
finalizedReads.addAll(addToSyntheticReads(0, windowHeader.size() - 1));
|
||||
finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false));
|
||||
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
|
||||
}
|
||||
|
||||
|
|
@ -611,13 +675,96 @@ public class SlidingWindow {
|
|||
}
|
||||
|
||||
|
||||
|
||||
private List<GATKSAMRecord> createPolyploidConsensus(int start, int stop, int nHaplotypes, int hetRefPosition) {
|
||||
// we will create two (positive strand, negative strand) headers for each contig
|
||||
List<LinkedList<HeaderElement>> headersPosStrand = new ArrayList<LinkedList<HeaderElement>>();
|
||||
List<LinkedList<HeaderElement>> headersNegStrand = new ArrayList<LinkedList<HeaderElement>>();
|
||||
List<GATKSAMRecord> hetReads = new LinkedList<GATKSAMRecord>();
|
||||
Map<Byte, Integer> haplotypeHeaderMap = new HashMap<Byte, Integer>(nHaplotypes);
|
||||
int currentHaplotype = 0;
|
||||
int refStart = windowHeader.get(start).getLocation();
|
||||
int refStop = windowHeader.get(stop).getLocation();
|
||||
List<GATKSAMRecord> toRemove = new LinkedList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord read : readsInWindow) {
|
||||
int haplotype;
|
||||
|
||||
// check if the read is either before or inside the variant region
|
||||
if (read.getSoftStart() <= refStop) {
|
||||
// check if the read is inside the variant region
|
||||
if (read.getMappingQuality() >= MIN_MAPPING_QUALITY && read.getSoftEnd() >= refStart) {
|
||||
// check if the read contains the het site
|
||||
if (read.getSoftStart() <= hetRefPosition && read.getSoftEnd() >= hetRefPosition) {
|
||||
int readPos = ReadUtils.getReadCoordinateForReferenceCoordinate(read, hetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL);
|
||||
byte base = read.getReadBases()[readPos];
|
||||
byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPos];
|
||||
|
||||
// check if base passes the filters!
|
||||
if (qual >= MIN_BASE_QUAL_TO_COUNT) {
|
||||
// check which haplotype this read represents and take the index of it from the list of headers
|
||||
if (haplotypeHeaderMap.containsKey(base)) {
|
||||
haplotype = haplotypeHeaderMap.get(base);
|
||||
}
|
||||
// create new lists if this haplotype has not been seen yet
|
||||
else {
|
||||
haplotype = currentHaplotype;
|
||||
haplotypeHeaderMap.put(base, currentHaplotype);
|
||||
headersPosStrand.add(new LinkedList<HeaderElement>());
|
||||
headersNegStrand.add(new LinkedList<HeaderElement>());
|
||||
currentHaplotype++;
|
||||
}
|
||||
LinkedList<HeaderElement> header = read.getReadNegativeStrandFlag() ? headersNegStrand.get(haplotype) : headersPosStrand.get(haplotype);
|
||||
// add to the polyploid header
|
||||
addToHeader(header, read);
|
||||
// remove from the standard header so that we don't double count it
|
||||
removeFromHeader(windowHeader, read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// we remove all reads before and inside the variant region from the window
|
||||
toRemove.add(read);
|
||||
}
|
||||
}
|
||||
|
||||
for (LinkedList<HeaderElement> header : headersPosStrand) {
|
||||
if (header.size() > 0)
|
||||
hetReads.addAll(addToSyntheticReads(header, 0, header.size(), false));
|
||||
if (runningConsensus != null)
|
||||
hetReads.add(finalizeRunningConsensus());
|
||||
}
|
||||
for (LinkedList<HeaderElement> header : headersNegStrand) {
|
||||
if (header.size() > 0)
|
||||
hetReads.addAll(addToSyntheticReads(header, 0, header.size(), true));
|
||||
if (runningConsensus != null)
|
||||
hetReads.add(finalizeRunningConsensus());
|
||||
}
|
||||
|
||||
for (GATKSAMRecord read : toRemove) {
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
return hetReads;
|
||||
}
|
||||
|
||||
|
||||
private void addToHeader(LinkedList<HeaderElement> header, GATKSAMRecord read) {
|
||||
updateHeaderCounts(header, read, false);
|
||||
}
|
||||
|
||||
private void removeFromHeader(LinkedList<HeaderElement> header, GATKSAMRecord read) {
|
||||
updateHeaderCounts(header, read, true);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Updates the sliding window's header counts with the incoming read bases, insertions
|
||||
* and deletions.
|
||||
*
|
||||
* @param header the sliding window header to use
|
||||
* @param read the incoming read to be added to the sliding window
|
||||
* @param removeRead if we are removing the read from the header or adding
|
||||
*/
|
||||
protected void updateHeaderCounts(GATKSAMRecord read, boolean removeRead) {
|
||||
private void updateHeaderCounts(LinkedList<HeaderElement> header, GATKSAMRecord read, boolean removeRead) {
|
||||
byte[] bases = read.getReadBases();
|
||||
byte[] quals = read.getBaseQualities();
|
||||
byte[] insQuals = read.getExistingBaseInsertionQualities();
|
||||
|
|
@ -627,8 +774,9 @@ public class SlidingWindow {
|
|||
Cigar cigar = read.getCigar();
|
||||
|
||||
int readBaseIndex = 0;
|
||||
int startLocation = getStartLocation();
|
||||
int startLocation = getStartLocation(header);
|
||||
int locationIndex = startLocation < 0 ? 0 : readStart - startLocation;
|
||||
int stopLocation = getStopLocation(header);
|
||||
|
||||
if (removeRead && locationIndex < 0)
|
||||
throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " start " + read.getUnclippedStart() + "," + read.getUnclippedEnd() + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation);
|
||||
|
|
@ -636,7 +784,7 @@ public class SlidingWindow {
|
|||
if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it
|
||||
if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window
|
||||
for (int i = 1; i <= -locationIndex; i++)
|
||||
windowHeader.addFirst(new HeaderElement(startLocation - i));
|
||||
header.addFirst(new HeaderElement(startLocation - i));
|
||||
|
||||
startLocation = readStart; // update start location accordingly
|
||||
locationIndex = 0;
|
||||
|
|
@ -645,19 +793,17 @@ public class SlidingWindow {
|
|||
if (stopLocation < readEnd) { // Do we need to add extra elements to the header?
|
||||
int elementsToAdd = (stopLocation < 0) ? readEnd - readStart + 1 : readEnd - stopLocation;
|
||||
while (elementsToAdd-- > 0)
|
||||
windowHeader.addLast(new HeaderElement(readEnd - elementsToAdd));
|
||||
|
||||
stopLocation = readEnd; // update stopLocation accordingly
|
||||
header.addLast(new HeaderElement(readEnd - elementsToAdd));
|
||||
}
|
||||
|
||||
// Special case for leading insertions before the beginning of the sliding read
|
||||
if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) {
|
||||
windowHeader.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added
|
||||
header.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added
|
||||
locationIndex = 1; // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing.
|
||||
}
|
||||
}
|
||||
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(locationIndex);
|
||||
Iterator<HeaderElement> headerElementIterator = header.listIterator(locationIndex);
|
||||
HeaderElement headerElement;
|
||||
for (CigarElement cigarElement : cigar.getCigarElements()) {
|
||||
switch (cigarElement.getOperator()) {
|
||||
|
|
@ -668,7 +814,7 @@ public class SlidingWindow {
|
|||
break;
|
||||
}
|
||||
|
||||
headerElement = windowHeader.get(locationIndex - 1); // insertions are added to the base to the left (previous element)
|
||||
headerElement = header.get(locationIndex - 1); // insertions are added to the base to the left (previous element)
|
||||
|
||||
if (removeRead) {
|
||||
headerElement.removeInsertionToTheRight();
|
||||
|
|
|
|||
|
|
@ -5,9 +5,9 @@ import net.sf.samtools.Cigar;
|
|||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.recalibration.EventType;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
|
|
@ -44,8 +44,9 @@ public class SyntheticRead {
|
|||
private String contig;
|
||||
private int contigIndex;
|
||||
private String readName;
|
||||
private Integer refStart;
|
||||
private int refStart;
|
||||
private boolean hasIndelQualities = false;
|
||||
private boolean isNegativeStrand = false;
|
||||
|
||||
/**
|
||||
* Full initialization of the running consensus if you have all the information and are ready to
|
||||
|
|
@ -59,7 +60,7 @@ public class SyntheticRead {
|
|||
* @param refStart the alignment start (reference based)
|
||||
* @param readTag the reduce reads tag for the synthetic read
|
||||
*/
|
||||
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities) {
|
||||
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
final int initialCapacity = 10000;
|
||||
bases = new ArrayList<BaseIndex>(initialCapacity);
|
||||
counts = new ArrayList<Byte>(initialCapacity);
|
||||
|
|
@ -76,9 +77,10 @@ public class SyntheticRead {
|
|||
this.readName = readName;
|
||||
this.refStart = refStart;
|
||||
this.hasIndelQualities = hasIndelQualities;
|
||||
this.isNegativeStrand = isNegativeRead;
|
||||
}
|
||||
|
||||
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities) {
|
||||
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
|
||||
this.bases = bases;
|
||||
this.counts = counts;
|
||||
this.quals = quals;
|
||||
|
|
@ -93,6 +95,7 @@ public class SyntheticRead {
|
|||
this.readName = readName;
|
||||
this.refStart = refStart;
|
||||
this.hasIndelQualities = hasIndelQualities;
|
||||
this.isNegativeStrand = isNegativeRead;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -112,11 +115,15 @@ public class SyntheticRead {
|
|||
this.mappingQuality += mappingQuality;
|
||||
}
|
||||
|
||||
public BaseIndex getBase(int readCoordinate) {
|
||||
public BaseIndex getBase(final int readCoordinate) {
|
||||
return bases.get(readCoordinate);
|
||||
}
|
||||
|
||||
/**
|
||||
public int getRefStart() {
|
||||
return refStart;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
|
||||
*
|
||||
* Invalid reads are :
|
||||
|
|
@ -133,6 +140,7 @@ public class SyntheticRead {
|
|||
read.setReferenceIndex(contigIndex);
|
||||
read.setReadPairedFlag(false);
|
||||
read.setReadUnmappedFlag(false);
|
||||
read.setReadNegativeStrandFlag(isNegativeStrand);
|
||||
read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
|
||||
read.setAlignmentStart(refStart);
|
||||
read.setReadName(readName);
|
||||
|
|
|
|||
|
|
@ -53,13 +53,14 @@ public class ErrorModel {
|
|||
|
||||
PairHMMIndelErrorModel pairModel = null;
|
||||
LinkedHashMap<Allele, Haplotype> haplotypeMap = null;
|
||||
HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap = null;
|
||||
double[][] perReadLikelihoods = null;
|
||||
|
||||
double[] model = new double[maxQualityScore+1];
|
||||
Arrays.fill(model,Double.NEGATIVE_INFINITY);
|
||||
|
||||
boolean hasCalledAlleles = false;
|
||||
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
if (refSampleVC != null) {
|
||||
|
||||
for (Allele allele : refSampleVC.getAlleles()) {
|
||||
|
|
@ -72,7 +73,6 @@ public class ErrorModel {
|
|||
if (refSampleVC.isIndel()) {
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
|
||||
indelLikelihoodMap = new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();
|
||||
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
|
||||
}
|
||||
}
|
||||
|
|
@ -92,12 +92,12 @@ public class ErrorModel {
|
|||
|
||||
Allele refAllele = refSampleVC.getReference();
|
||||
|
||||
if (refSampleVC.isIndel()) {
|
||||
if ( refSampleVC.isIndel()) {
|
||||
final int readCounts[] = new int[refSamplePileup.getNumberOfElements()];
|
||||
//perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()];
|
||||
final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles());
|
||||
if (!haplotypeMap.isEmpty())
|
||||
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts);
|
||||
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap, readCounts);
|
||||
}
|
||||
int idx = 0;
|
||||
for (PileupElement refPileupElement : refSamplePileup) {
|
||||
|
|
@ -195,8 +195,8 @@ public class ErrorModel {
|
|||
if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getEventLength() == -eventLength)
|
||||
return true;
|
||||
|
||||
if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
|
||||
Arrays.equals(pileupElement.getEventBases().getBytes(),alleleBases))
|
||||
if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
|
||||
Arrays.equals(pileupElement.getEventBases().getBytes(),Arrays.copyOfRange(alleleBases,1,alleleBases.length))) // allele contains ref byte, but pileupElement's event bases doesn't
|
||||
return true;
|
||||
|
||||
return false;
|
||||
|
|
|
|||
|
|
@ -26,6 +26,8 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACcounts;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
|
@ -123,7 +125,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
*
|
||||
*
|
||||
*/
|
||||
protected static class SumIterator {
|
||||
public static class SumIterator {
|
||||
private int[] currentState;
|
||||
private final int[] finalState;
|
||||
private final int restrictSumTo;
|
||||
|
|
@ -491,32 +493,32 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
// If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
|
||||
// and we repeat until queue is empty
|
||||
// queue of AC conformations to process
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();
|
||||
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
|
||||
// mapping of ExactACset indexes to the objects
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(likelihoodDim);
|
||||
// add AC=0 to the queue
|
||||
final int[] zeroCounts = new int[nAlleles];
|
||||
zeroCounts[0] = numChromosomes;
|
||||
|
||||
AlleleFrequencyCalculationModel.ExactACset zeroSet =
|
||||
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));
|
||||
ExactACset zeroSet =
|
||||
new ExactACset(1, new ExactACcounts(zeroCounts));
|
||||
|
||||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
// compute log10Likelihoods
|
||||
final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
|
||||
final ExactACset ACset = ACqueue.remove();
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
maxLog10L = Math.max(maxLog10L, log10LofKs);
|
||||
// clean up memory
|
||||
indexesToACset.remove(ACset.ACcounts);
|
||||
indexesToACset.remove(ACset.getACcounts());
|
||||
if ( VERBOSE )
|
||||
System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
|
||||
System.out.printf(" *** removing used set=%s%n", ACset.getACcounts());
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -525,13 +527,13 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
int plIdx = 0;
|
||||
SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
|
||||
while (iterator.hasNext()) {
|
||||
AlleleFrequencyCalculationModel.ExactACset ACset =
|
||||
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
|
||||
ExactACset ACset =
|
||||
new ExactACset(1, new ExactACcounts(iterator.getCurrentVector()));
|
||||
// for observed base X, add Q(jX,k) to likelihood vector for all k in error model
|
||||
//likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
|
||||
getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
|
||||
|
||||
setLogPLs(plIdx++, ACset.log10Likelihoods[0]);
|
||||
setLogPLs(plIdx++, ACset.getLog10Likelihoods()[0]);
|
||||
iterator.next();
|
||||
}
|
||||
}
|
||||
|
|
@ -540,40 +542,40 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
|
||||
}
|
||||
|
||||
private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
|
||||
private double calculateACConformationAndUpdateQueue(final ExactACset set,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
final double maxLog10L,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
|
||||
AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts,
|
||||
ExactACset> indexesToACset,
|
||||
final ReadBackedPileup pileup) {
|
||||
// compute likelihood of set
|
||||
getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
|
||||
final double log10LofK = set.log10Likelihoods[0];
|
||||
final double log10LofK = set.getLog10Likelihoods()[0];
|
||||
|
||||
// log result in PL vector
|
||||
int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes);
|
||||
int idx = getLinearIndex(set.getACcounts().getCounts(), nAlleles, numChromosomes);
|
||||
setLogPLs(idx, log10LofK);
|
||||
|
||||
// can we abort early because the log10Likelihoods are so small?
|
||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
if ( VERBOSE )
|
||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, maxLog10L);
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// iterate over higher frequencies if possible
|
||||
// by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
|
||||
final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0];
|
||||
final int ACwiggle = numChromosomes - set.getACsum() + set.getACcounts().getCounts()[0];
|
||||
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
|
||||
return log10LofK;
|
||||
|
||||
|
||||
// add conformations for other cases
|
||||
for ( int allele = 1; allele < nAlleles; allele++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
final int[] ACcountsClone = set.getACcounts().getCounts().clone();
|
||||
ACcountsClone[allele]++;
|
||||
// is this a valid conformation?
|
||||
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
|
||||
|
|
@ -597,7 +599,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
* @param numObservations Number of observations for each allele
|
||||
* @param pileup Read backed pileup in case it's necessary
|
||||
*/
|
||||
public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public abstract void getLikelihoodOfConformation(final ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
|
|
@ -608,12 +610,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
|
|||
|
||||
// Static methods
|
||||
public static void updateACset(final int[] newSetCounts,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
||||
|
||||
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
|
||||
final ExactACcounts index = new ExactACcounts(newSetCounts);
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
|
||||
ExactACset newSet = new ExactACset(1, index);
|
||||
indexesToACset.put(index, newSet);
|
||||
ACqueue.add(newSet);
|
||||
if (VERBOSE)
|
||||
|
|
|
|||
|
|
@ -41,15 +41,6 @@ import java.util.*;
|
|||
|
||||
public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
|
||||
|
||||
//protected Set<String> laneIDs;
|
||||
public enum Model {
|
||||
SNP,
|
||||
INDEL,
|
||||
POOLSNP,
|
||||
POOLINDEL,
|
||||
BOTH
|
||||
}
|
||||
|
||||
final protected UnifiedArgumentCollection UAC;
|
||||
|
||||
protected GeneralPloidyGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
|
||||
|
|
@ -203,7 +194,8 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
|
|||
final AlignmentContextUtils.ReadOrientation contextType,
|
||||
final List<Allele> allAllelesToUse,
|
||||
final boolean useBAQedPileup,
|
||||
final GenomeLocParser locParser) {
|
||||
final GenomeLocParser locParser,
|
||||
final Map<String,PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
|
||||
|
||||
HashMap<String, ErrorModel> perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts);
|
||||
if (perLaneErrorModels == null && UAC.referenceSampleName != null)
|
||||
|
|
@ -215,8 +207,11 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
|
|||
newContext.put(DUMMY_SAMPLE_NAME,mergedContext);
|
||||
contexts = newContext;
|
||||
}
|
||||
|
||||
// get initial alleles to genotype
|
||||
if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
|
||||
// starting a new site: clear allele list
|
||||
perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods
|
||||
}
|
||||
// get initial alleles to genotype
|
||||
final List<Allele> allAlleles = new ArrayList<Allele>();
|
||||
if (allAllelesToUse == null || allAllelesToUse.isEmpty())
|
||||
allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse));
|
||||
|
|
@ -234,9 +229,13 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
|
|||
continue;
|
||||
|
||||
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
|
||||
if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){
|
||||
// no likelihoods have been computed for this sample at this site
|
||||
perReadAlleleLikelihoodMap.put(sample.getKey(), new PerReadAlleleLikelihoodMap());
|
||||
}
|
||||
|
||||
// create the GenotypeLikelihoods object
|
||||
final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO);
|
||||
final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO, perReadAlleleLikelihoodMap.get(sample.getKey()));
|
||||
// actually compute likelihoods
|
||||
final int nGoodBases = GL.add(pileup, UAC);
|
||||
if ( nGoodBases > 0 )
|
||||
|
|
@ -333,7 +332,8 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
|
|||
final HashMap<String, ErrorModel> perLaneErrorModels,
|
||||
final boolean useBQAedPileup,
|
||||
final ReferenceContext ref,
|
||||
final boolean ignoreLaneInformation);
|
||||
final boolean ignoreLaneInformation,
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap);
|
||||
|
||||
protected abstract List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
|
@ -26,6 +27,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
|
|||
double[][] readHaplotypeLikelihoods;
|
||||
|
||||
final byte refBase;
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap;
|
||||
|
||||
public GeneralPloidyIndelGenotypeLikelihoods(final List<Allele> alleles,
|
||||
final double[] logLikelihoods,
|
||||
|
|
@ -34,7 +36,8 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
|
|||
final boolean ignoreLaneInformation,
|
||||
final PairHMMIndelErrorModel pairModel,
|
||||
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
|
||||
final ReferenceContext referenceContext) {
|
||||
final ReferenceContext referenceContext,
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) {
|
||||
super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
|
||||
this.pairModel = pairModel;
|
||||
this.haplotypeMap = haplotypeMap;
|
||||
|
|
@ -42,6 +45,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
|
|||
this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
|
||||
// todo - not needed if indel alleles have base at current position
|
||||
this.refBase = referenceContext.getBase();
|
||||
this.perReadAlleleLikelihoodMap = perReadAlleleLikelihoodMap;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
|
@ -142,8 +146,9 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
|
|||
List<Integer> numSeenBases = new ArrayList<Integer>(this.alleles.size());
|
||||
|
||||
if (!hasReferenceSampleData) {
|
||||
|
||||
final int readCounts[] = new int[pileup.getNumberOfElements()];
|
||||
readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts);
|
||||
readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap, readCounts);
|
||||
n = readHaplotypeLikelihoods.length;
|
||||
} else {
|
||||
Allele refAllele = null;
|
||||
|
|
@ -184,12 +189,12 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
|
|||
* @param alleleList List of alleles
|
||||
* @param numObservations Number of observations for each allele in alleleList
|
||||
*/
|
||||
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public void getLikelihoodOfConformation(final ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
final ReadBackedPileup pileup) {
|
||||
final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size());
|
||||
final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), alleleList.size());
|
||||
double p1 = 0.0;
|
||||
|
||||
if (!hasReferenceSampleData) {
|
||||
|
|
@ -214,6 +219,6 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
|
|||
}
|
||||
p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec);
|
||||
}
|
||||
ACset.log10Likelihoods[0] = p1;
|
||||
ACset.getLog10Likelihoods()[0] = p1;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -73,8 +73,9 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
|
|||
final HashMap<String, ErrorModel> perLaneErrorModels,
|
||||
final boolean useBQAedPileup,
|
||||
final ReferenceContext ref,
|
||||
final boolean ignoreLaneInformation){
|
||||
return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref);
|
||||
final boolean ignoreLaneInformation,
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){
|
||||
return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref, perReadAlleleLikelihoodMap);
|
||||
}
|
||||
|
||||
protected List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
|
||||
|
|
@ -90,7 +91,6 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
|
|||
if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE)
|
||||
alleles = alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE);
|
||||
if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
|
||||
IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().clear();
|
||||
haplotypeMap.clear();
|
||||
}
|
||||
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap);
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
|
|||
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
|
|
@ -12,7 +13,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
|||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
|
@ -218,12 +222,12 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
|
|||
* @param alleleList List of alleles
|
||||
* @param numObservations Number of observations for each allele in alleleList
|
||||
*/
|
||||
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
public void getLikelihoodOfConformation(final ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
final ReadBackedPileup pileup) {
|
||||
final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length);
|
||||
final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), BaseUtils.BASES.length);
|
||||
final int[] ac = new int[BaseUtils.BASES.length];
|
||||
|
||||
for (int k=0; k < BaseUtils.BASES.length; k++ )
|
||||
|
|
@ -238,9 +242,9 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
|
|||
final byte qual = qualToUse(elt, true, true, mbq);
|
||||
if ( qual == 0 )
|
||||
continue;
|
||||
final double acc[] = new double[ACset.ACcounts.counts.length];
|
||||
final double acc[] = new double[ACset.getACcounts().getCounts().length];
|
||||
for (int k=0; k < acc.length; k++ )
|
||||
acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]]
|
||||
acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.getACcounts().getCounts()[k]]
|
||||
- LOG10_PLOIDY;
|
||||
p1 += MathUtils.log10sumLog10(acc);
|
||||
}
|
||||
|
|
@ -264,7 +268,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
|
|||
|
||||
p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec);
|
||||
}
|
||||
ACset.log10Likelihoods[0] = p1;
|
||||
ACset.getLog10Likelihoods()[0] = p1;
|
||||
/* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1));
|
||||
System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ)));
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -49,7 +49,8 @@ public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends General
|
|||
final HashMap<String, ErrorModel> perLaneErrorModels,
|
||||
final boolean useBQAedPileup,
|
||||
final ReferenceContext ref,
|
||||
final boolean ignoreLaneInformation) {
|
||||
final boolean ignoreLaneInformation,
|
||||
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){
|
||||
return new GeneralPloidySNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,277 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.apache.log4j.ConsoleAppender;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.log4j.TTCCLayout;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReport;
|
||||
import org.broadinstitute.sting.gatk.report.GATKReportTable;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
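/**
 * Command-line harness that measures the runtime and number of evaluations of the
 * AFCalc implementations, either across a grid of test configurations (ANALYZE,
 * written out as GATKReport tables) or for a single large configuration (SINGLE).
 *
 * User: depristo
 * Date: 10/2/12
 * Time: 10:25 AM
 */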
|
||||
public class AFCalcPerformanceTest {
|
||||
final static Logger logger = Logger.getLogger(AFCalcPerformanceTest.class);
|
||||
|
||||
private static abstract class Analysis {
|
||||
final GATKReport report;
|
||||
|
||||
public Analysis(final String name, final List<String> columns) {
|
||||
report = GATKReport.newSimpleReport(name, columns);
|
||||
}
|
||||
|
||||
public abstract void run(final AFCalcTestBuilder testBuilder,
|
||||
final List<Object> coreColumns);
|
||||
|
||||
public String getName() {
|
||||
return getTable().getTableName();
|
||||
}
|
||||
|
||||
public GATKReportTable getTable() {
|
||||
return report.getTables().iterator().next();
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeByACAndPL extends Analysis {
|
||||
public AnalyzeByACAndPL(final List<String> columns) {
|
||||
super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac"));
|
||||
}
|
||||
|
||||
public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(100) ) {
|
||||
final AFCalc calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) {
|
||||
final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);
|
||||
|
||||
timer.start();
|
||||
final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
int otherAC = 0;
|
||||
int nAltSeg = 0;
|
||||
for ( int i = 0; i < ACs.length; i++ ) {
|
||||
nAltSeg += ACs[i] > 0 ? 1 : 0;
|
||||
if ( i > 0 ) otherAC += ACs[i];
|
||||
}
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<int[]> makeACs(final int nAltAlleles, final int nChrom) {
|
||||
if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3");
|
||||
|
||||
final List<int[]> ACs = new LinkedList<int[]>();
|
||||
|
||||
final List<Integer> ACsToTry = MathUtils.log10LinearRange(0, nChrom, 0.1); //Arrays.asList(0, 1, 2, 3, 6, 10, 20, 40, 60, 100, 200, 400, 600, 1000, 2000, 4000, 6000, 10000, 100000);
|
||||
|
||||
for ( int i : ACsToTry ) {
|
||||
if ( i < nChrom ) {
|
||||
if ( nAltAlleles == 1 ) {
|
||||
ACs.add(new int[]{i});
|
||||
} else if ( nAltAlleles == 2 ) {
|
||||
for ( int j : ACsToTry ) {
|
||||
if ( j < nChrom - i )
|
||||
ACs.add(new int[]{i, j});
|
||||
}
|
||||
} else {
|
||||
throw new IllegalStateException("cannot get here");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return ACs;
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeBySingletonPosition extends Analysis {
|
||||
public AnalyzeBySingletonPosition(final List<String> columns) {
|
||||
super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton"));
|
||||
}
|
||||
|
||||
public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(100) ) {
|
||||
final AFCalc calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
final int[] ac = new int[testBuilder.numAltAlleles];
|
||||
ac[0] = 1;
|
||||
final VariantContext vc = testBuilder.makeACTest(ac, 0, nonTypePL);
|
||||
|
||||
for ( final int position : MathUtils.log10LinearRange(0, vc.getNSamples(), 0.1) ) {
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder(vc);
|
||||
final List<Genotype> genotypes = new ArrayList<Genotype>(vc.getGenotypes());
|
||||
Collections.rotate(genotypes, position);
|
||||
vcb.genotypes(genotypes);
|
||||
|
||||
timer.start();
|
||||
final AFCalcResult resultTracker = calc.getLog10PNonRef(vcb.make(), priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, position));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class AnalyzeByNonInformative extends Analysis {
|
||||
public AnalyzeByNonInformative(final List<String> columns) {
|
||||
super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative"));
|
||||
}
|
||||
|
||||
public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
|
||||
final SimpleTimer timer = new SimpleTimer();
|
||||
|
||||
for ( final int nonTypePL : Arrays.asList(100) ) {
|
||||
final AFCalc calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
|
||||
final int[] ac = new int[testBuilder.numAltAlleles];
|
||||
ac[0] = 1;
|
||||
|
||||
for ( int nNonInformative = 0; nNonInformative < testBuilder.nSamples; nNonInformative++ ) {
|
||||
final VariantContext vc = testBuilder.makeACTest(ac, nNonInformative, nonTypePL);
|
||||
|
||||
timer.start();
|
||||
final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors);
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
|
||||
final List<Object> columns = new LinkedList<Object>(coreValues);
|
||||
columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, nNonInformative));
|
||||
report.addRowList(columns);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static class ModelParams {
|
||||
final AFCalcFactory.Calculation modelType;
|
||||
final int maxBiNSamples, maxTriNSamples;
|
||||
|
||||
private ModelParams(AFCalcFactory.Calculation modelType, int maxBiNSamples, int maxTriNSamples) {
|
||||
this.modelType = modelType;
|
||||
this.maxBiNSamples = maxBiNSamples;
|
||||
this.maxTriNSamples = maxTriNSamples;
|
||||
}
|
||||
|
||||
public boolean meetsConstraints(final int nAltAlleles, final int nSamples) {
|
||||
if ( nAltAlleles == 1 )
|
||||
return nSamples <= maxBiNSamples;
|
||||
else if ( nAltAlleles == 2 )
|
||||
return nSamples <= maxTriNSamples;
|
||||
else
|
||||
throw new IllegalStateException("Unexpected number of alt alleles " + nAltAlleles);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Operation selected by the first command-line argument of main:
 * ANALYZE dispatches to analyze(args), SINGLE dispatches to profileBig(args).
 */
public enum Operation {
    ANALYZE, SINGLE
}
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final TTCCLayout layout = new TTCCLayout();
|
||||
layout.setThreadPrinting(false);
|
||||
layout.setCategoryPrefixing(false);
|
||||
layout.setContextPrinting(false);
|
||||
logger.addAppender(new ConsoleAppender(layout));
|
||||
|
||||
final Operation op = Operation.valueOf(args[0]);
|
||||
|
||||
switch ( op ) {
|
||||
case ANALYZE: analyze(args); break;
|
||||
case SINGLE: profileBig(args); break;
|
||||
default: throw new IllegalAccessException("unknown operation " + op);
|
||||
}
|
||||
}
|
||||
|
||||
private static void profileBig(final String[] args) throws Exception {
|
||||
final int nSamples = Integer.valueOf(args[1]);
|
||||
final int ac = Integer.valueOf(args[2]);
|
||||
|
||||
final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(nSamples, 1,
|
||||
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
|
||||
AFCalcTestBuilder.PriorType.human);
|
||||
|
||||
final VariantContext vc = testBuilder.makeACTest(new int[]{ac}, 0, 100);
|
||||
|
||||
final SimpleTimer timer = new SimpleTimer().start();
|
||||
final AFCalcResult resultTracker = testBuilder.makeModel().getLog10PNonRef(vc, testBuilder.makePriors());
|
||||
final long runtime = timer.getElapsedTimeNano();
|
||||
logger.info("result " + resultTracker.getLog10PosteriorOfAFGT0());
|
||||
logger.info("runtime " + runtime);
|
||||
}
|
||||
|
||||
private static void analyze(final String[] args) throws Exception {
|
||||
final List<String> coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples",
|
||||
"exact.model", "prior.type", "runtime", "n.evaluations");
|
||||
|
||||
final PrintStream out = new PrintStream(new FileOutputStream(args[1]));
|
||||
|
||||
final List<ModelParams> modelParams = Arrays.asList(
|
||||
new ModelParams(AFCalcFactory.Calculation.EXACT_REFERENCE, 10000, 10),
|
||||
// new ModelParams(AFCalcTestBuilder.ModelType.GeneralExact, 100, 10),
|
||||
new ModelParams(AFCalcFactory.Calculation.EXACT_CONSTRAINED, 10000, 100),
|
||||
new ModelParams(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 10000, 1000));
|
||||
|
||||
final boolean ONLY_HUMAN_PRIORS = false;
|
||||
final List<AFCalcTestBuilder.PriorType> priorTypes = ONLY_HUMAN_PRIORS
|
||||
? Arrays.asList(AFCalcTestBuilder.PriorType.values())
|
||||
: Arrays.asList(AFCalcTestBuilder.PriorType.human);
|
||||
|
||||
final List<Analysis> analyzes = new ArrayList<Analysis>();
|
||||
analyzes.add(new AnalyzeByACAndPL(coreColumns));
|
||||
analyzes.add(new AnalyzeBySingletonPosition(coreColumns));
|
||||
//analyzes.add(new AnalyzeByNonInformative(coreColumns));
|
||||
|
||||
for ( int iteration = 0; iteration < 1; iteration++ ) {
|
||||
for ( final int nAltAlleles : Arrays.asList(1, 2) ) {
|
||||
for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
|
||||
for ( final ModelParams modelToRun : modelParams) {
|
||||
if ( modelToRun.meetsConstraints(nAltAlleles, nSamples) ) {
|
||||
for ( final AFCalcTestBuilder.PriorType priorType : priorTypes ) {
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(nSamples, nAltAlleles, modelToRun.modelType, priorType);
|
||||
|
||||
for ( final Analysis analysis : analyzes ) {
|
||||
logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType, analysis.getName())));
|
||||
final List<?> values = Arrays.asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType);
|
||||
analysis.run(testBuilder, (List<Object>)values);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final GATKReport report = new GATKReport();
|
||||
for ( final Analysis analysis : analyzes )
|
||||
report.addTable(analysis.getTable());
|
||||
report.print(out);
|
||||
out.close();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,170 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
public class AFCalcTestBuilder {
|
||||
final static Allele A = Allele.create("A", true);
|
||||
final static Allele C = Allele.create("C");
|
||||
final static Allele G = Allele.create("G");
|
||||
final static Allele T = Allele.create("T");
|
||||
final static Allele AA = Allele.create("AA");
|
||||
final static Allele AT = Allele.create("AT");
|
||||
final static Allele AG = Allele.create("AG");
|
||||
|
||||
static int sampleNameCounter = 0;
|
||||
|
||||
final int nSamples;
|
||||
final int numAltAlleles;
|
||||
final AFCalcFactory.Calculation modelType;
|
||||
final PriorType priorType;
|
||||
|
||||
/**
 * Creates a builder for one test configuration.
 *
 * @param nSamples      number of samples in the contexts this builder creates
 * @param numAltAlleles number of alt alleles in those contexts
 * @param modelType     calculation model returned by makeModel()
 * @param priorType     kind of priors returned by makePriors()
 */
public AFCalcTestBuilder(final int nSamples, final int numAltAlleles,
                         final AFCalcFactory.Calculation modelType, final PriorType priorType) {
    this.modelType = modelType;
    this.priorType = priorType;
    this.numAltAlleles = numAltAlleles;
    this.nSamples = nSamples;
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("AFCalcTestBuilder nSamples=%d nAlts=%d model=%s prior=%s", nSamples, numAltAlleles, modelType, priorType);
|
||||
}
|
||||
|
||||
/**
 * Kind of allele frequency priors produced by makePriors(): flat is built by
 * normalizing an all-zero array, human is filled in by
 * UnifiedGenotyperEngine.computeAlleleFrequencyPriors.
 */
public enum PriorType {
    flat, human
}
|
||||
|
||||
public int getnSamples() {
|
||||
return nSamples;
|
||||
}
|
||||
|
||||
public AFCalc makeModel() {
|
||||
return AFCalcFactory.createAFCalc(modelType, nSamples, 4, 4, 2);
|
||||
}
|
||||
|
||||
public double[] makePriors() {
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
|
||||
switch ( priorType ) {
|
||||
case flat:
|
||||
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
case human:
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
|
||||
return humanPriors;
|
||||
default:
|
||||
throw new RuntimeException("Unexpected type " + priorType);
|
||||
}
|
||||
}
|
||||
|
||||
public VariantContext makeACTest(final List<Integer> ACs, final int nNonInformative, final int nonTypePL) {
|
||||
return makeACTest(ArrayUtils.toPrimitive(ACs.toArray(new Integer[]{})), nNonInformative, nonTypePL);
|
||||
}
|
||||
|
||||
public VariantContext makeACTest(final int[] ACs, final int nNonInformative, final int nonTypePL) {
|
||||
final int nChrom = nSamples * 2;
|
||||
|
||||
final int[] nhet = new int[numAltAlleles];
|
||||
final int[] nhomvar = new int[numAltAlleles];
|
||||
|
||||
for ( int i = 0; i < ACs.length; i++ ) {
|
||||
final double p = ACs[i] / (1.0 * nChrom);
|
||||
nhomvar[i] = (int)Math.floor((nSamples - nNonInformative) * p * p);
|
||||
nhet[i] = ACs[i] - 2 * nhomvar[i];
|
||||
|
||||
if ( nhet[i] < 0 )
|
||||
throw new IllegalStateException("Bug! nhet[i] < 0");
|
||||
}
|
||||
|
||||
final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar);
|
||||
if ( calcAC != MathUtils.sum(ACs) )
|
||||
throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs));
|
||||
|
||||
return makeACTest(nhet, nhomvar, nNonInformative, nonTypePL);
|
||||
}
|
||||
|
||||
public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nNonInformative, final int nonTypePL) {
|
||||
List<Genotype> samples = new ArrayList<Genotype>(nSamples);
|
||||
|
||||
for ( int altI = 0; altI < nhet.length; altI++ ) {
|
||||
for ( int i = 0; i < nhet[altI]; i++ )
|
||||
samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1));
|
||||
for ( int i = 0; i < nhomvar[altI]; i++ )
|
||||
samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1));
|
||||
}
|
||||
|
||||
final Genotype nonInformative = makeNonInformative();
|
||||
samples.addAll(Collections.nCopies(nNonInformative, nonInformative));
|
||||
|
||||
final int nRef = Math.max((int) (nSamples - nNonInformative - MathUtils.sum(nhet) - MathUtils.sum(nhomvar)), 0);
|
||||
samples.addAll(Collections.nCopies(nRef, makePL(GenotypeType.HOM_REF, nonTypePL, 0)));
|
||||
|
||||
samples = samples.subList(0, nSamples);
|
||||
|
||||
if ( samples.size() > nSamples )
|
||||
throw new IllegalStateException("too many samples");
|
||||
|
||||
VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles());
|
||||
vcb.genotypes(samples);
|
||||
return vcb.make();
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles() {
|
||||
return Arrays.asList(A, C, G, T, AA, AT, AG).subList(0, numAltAlleles+1);
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles(final GenotypeType type, final int altI) {
|
||||
switch (type) {
|
||||
case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0));
|
||||
case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(altI));
|
||||
case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI));
|
||||
default: throw new IllegalArgumentException("Unexpected type " + type);
|
||||
}
|
||||
}
|
||||
|
||||
public Genotype makePL(final List<Allele> expectedGT, int ... pls) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(expectedGT);
|
||||
gb.PL(pls);
|
||||
return gb.make();
|
||||
}
|
||||
|
||||
private int numPLs() {
|
||||
return GenotypeLikelihoods.numLikelihoods(numAltAlleles+1, 2);
|
||||
}
|
||||
|
||||
public Genotype makeNonInformative() {
|
||||
final int[] nonInformativePLs = new int[GenotypeLikelihoods.numLikelihoods(numAltAlleles, 2)];
|
||||
return makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), nonInformativePLs);
|
||||
}
|
||||
|
||||
public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(getAlleles(type, altI));
|
||||
|
||||
final int[] pls = new int[numPLs()];
|
||||
Arrays.fill(pls, nonTypePL);
|
||||
|
||||
int index = 0;
|
||||
switch ( type ) {
|
||||
case HOM_REF: index = GenotypeLikelihoods.calculatePLindex(0, 0); break;
|
||||
case HET: index = GenotypeLikelihoods.calculatePLindex(0, altI); break;
|
||||
case HOM_VAR: index = GenotypeLikelihoods.calculatePLindex(altI, altI); break;
|
||||
}
|
||||
pls[index] = 0;
|
||||
gb.PL(pls);
|
||||
|
||||
return gb.make();
|
||||
}
|
||||
}
|
||||
|
|
@ -23,56 +23,55 @@
|
|||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.ProbabilityVector;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
public class GeneralPloidyExactAFCalc extends ExactAFCalc {
|
||||
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
|
||||
final protected UnifiedArgumentCollection UAC;
|
||||
|
||||
private final int ploidy;
|
||||
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
|
||||
private final static boolean VERBOSE = false;
|
||||
|
||||
protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
|
||||
super(UAC, N, logger, verboseWriter);
|
||||
ploidy = UAC.samplePloidy;
|
||||
this.UAC = UAC;
|
||||
|
||||
protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int maxAltAllelesForIndels, final int ploidy) {
|
||||
super(nSamples, maxAltAlleles, maxAltAllelesForIndels, ploidy);
|
||||
this.ploidy = ploidy;
|
||||
}
|
||||
|
||||
public List<Allele> getLog10PNonRef(final VariantContext vc,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
GenotypesContext GLs = vc.getGenotypes();
|
||||
List<Allele> alleles = vc.getAlleles();
|
||||
@Override
|
||||
protected VariantContext reduceScope(VariantContext vc) {
|
||||
final int maxAltAlleles = vc.getType().equals(VariantContext.Type.INDEL) ? maxAlternateAllelesForIndels : maxAlternateAllelesToGenotype;
|
||||
|
||||
// don't try to genotype too many alternate alleles
|
||||
if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
|
||||
logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
if ( vc.getAlternateAlleles().size() > maxAltAlleles) {
|
||||
logger.warn("this tool is currently set to genotype at most " + maxAltAlleles + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
|
||||
|
||||
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
|
||||
final List<Allele> alleles = new ArrayList<Allele>(maxAltAlleles + 1);
|
||||
alleles.add(vc.getReference());
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));
|
||||
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, maxAltAlleles, ploidy));
|
||||
|
||||
|
||||
GLs = subsetAlleles(vc, alleles, false, ploidy);
|
||||
VariantContextBuilder builder = new VariantContextBuilder(vc);
|
||||
builder.alleles(alleles);
|
||||
builder.genotypes(subsetAlleles(vc, alleles, false, ploidy));
|
||||
return builder.make();
|
||||
} else {
|
||||
return vc;
|
||||
}
|
||||
|
||||
combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
|
||||
|
||||
return alleles;
|
||||
}
|
||||
|
||||
@Override
|
||||
public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
|
||||
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors, getResultTracker());
|
||||
return resultFromTracker(vc, log10AlleleFrequencyPriors);
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple wrapper class to hold values of combined pool likelihoods.
|
||||
|
|
@ -94,8 +93,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
|
||||
public void add(ExactACset set) {
|
||||
alleleCountSetList.add(set);
|
||||
conformationMap.put(set.ACcounts, set);
|
||||
final double likelihood = set.log10Likelihoods[0];
|
||||
conformationMap.put(set.getACcounts(), set);
|
||||
final double likelihood = set.getLog10Likelihoods()[0];
|
||||
|
||||
if (likelihood > maxLikelihood )
|
||||
maxLikelihood = likelihood;
|
||||
|
|
@ -108,11 +107,11 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
}
|
||||
|
||||
public double getLikelihoodOfConformation(int[] ac) {
|
||||
return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0];
|
||||
return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0];
|
||||
}
|
||||
|
||||
public double getGLOfACZero() {
|
||||
return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list
|
||||
return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
|
|
@ -136,7 +135,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
|
||||
|
||||
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
|
||||
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), true);
|
||||
for ( final double[] likelihoods : GLs ) {
|
||||
|
||||
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
|
||||
|
|
@ -171,15 +170,15 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
* @param numAlleles Number of alleles (including the reference allele)
|
||||
* @param ploidyPerPool Number of samples per pool
|
||||
* @param log10AlleleFrequencyPriors Frequency priors
|
||||
* @param result object to fill with output values
|
||||
* @param resultTracker object to fill with output values
|
||||
*/
|
||||
protected static void combineSinglePools(final GenotypesContext GLs,
|
||||
final int numAlleles,
|
||||
final int ploidyPerPool,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);
|
||||
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs, true);
|
||||
|
||||
|
||||
int combinedPloidy = 0;
|
||||
|
|
@ -190,20 +189,27 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
// first element: zero ploidy, e.g. trivial degenerate distribution
|
||||
final int[] zeroCounts = new int[numAlleles];
|
||||
final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts));
|
||||
set.log10Likelihoods[0] = 0.0;
|
||||
set.getLog10Likelihoods()[0] = 0.0;
|
||||
|
||||
combinedPoolLikelihoods.add(set);
|
||||
for (int p=1; p<genotypeLikelihoods.size(); p++) {
|
||||
result.reset();
|
||||
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
|
||||
numAlleles, log10AlleleFrequencyPriors, result);
|
||||
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
|
||||
|
||||
if ( genotypeLikelihoods.size() <= 1 ) {
|
||||
// no meaningful GLs at all, just set the tracker to non poly values
|
||||
resultTracker.reset(); // just mimic-ing call below
|
||||
resultTracker.setLog10LikelihoodOfAFzero(0.0);
|
||||
} else {
|
||||
for (int p=1; p<genotypeLikelihoods.size(); p++) {
|
||||
resultTracker.reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
|
||||
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
|
||||
numAlleles, log10AlleleFrequencyPriors, resultTracker);
|
||||
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool, double[] newGL, int originalPloidy, int newGLPloidy, int numAlleles,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
|
||||
|
||||
|
||||
|
|
@ -220,19 +226,24 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts));
|
||||
|
||||
ACqueue.add(zeroSet);
|
||||
indexesToACset.put(zeroSet.ACcounts, zeroSet);
|
||||
indexesToACset.put(zeroSet.getACcounts(), zeroSet);
|
||||
|
||||
// keep processing while we have AC conformations that need to be calculated
|
||||
double maxLog10L = Double.NEGATIVE_INFINITY;
|
||||
StateTracker stateTracker = new StateTracker();
|
||||
while ( !ACqueue.isEmpty() ) {
|
||||
resultTracker.incNEvaluations();
|
||||
// compute log10Likelihoods
|
||||
final ExactACset ACset = ACqueue.remove();
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLog10L, ACqueue, indexesToACset);
|
||||
maxLog10L = Math.max(maxLog10L, log10LofKs);
|
||||
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, resultTracker, stateTracker, ACqueue, indexesToACset);
|
||||
|
||||
// adjust max likelihood seen if needed
|
||||
if ( log10LofKs > stateTracker.getMaxLog10L())
|
||||
stateTracker.update(log10LofKs, ACset.getACcounts());
|
||||
|
||||
// clean up memory
|
||||
indexesToACset.remove(ACset.ACcounts);
|
||||
indexesToACset.remove(ACset.getACcounts());
|
||||
if ( VERBOSE )
|
||||
System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);
|
||||
System.out.printf(" *** removing used set=%s%n", ACset.getACcounts());
|
||||
|
||||
}
|
||||
return newPool;
|
||||
|
|
@ -248,8 +259,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
* @param log10AlleleFrequencyPriors Prior object
|
||||
* @param originalPloidy Total ploidy of original combined pool
|
||||
* @param newGLPloidy Ploidy of GL vector
|
||||
* @param result AFResult object
|
||||
* @param maxLog10L max likelihood observed so far
|
||||
* @param resultTracker AFResult object
|
||||
* @param stateTracker max likelihood observed so far
|
||||
* @param ACqueue Queue of conformations to compute
|
||||
* @param indexesToACset AC indices of objects in queue
|
||||
* @return max log likelihood
|
||||
|
|
@ -261,38 +272,40 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
final double[] log10AlleleFrequencyPriors,
|
||||
final int originalPloidy,
|
||||
final int newGLPloidy,
|
||||
final AlleleFrequencyCalculationResult result,
|
||||
final double maxLog10L,
|
||||
final AFCalcResultTracker resultTracker,
|
||||
final StateTracker stateTracker,
|
||||
final LinkedList<ExactACset> ACqueue,
|
||||
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
|
||||
|
||||
// compute likelihood in "set" of new set based on original likelihoods
|
||||
final int numAlleles = set.ACcounts.counts.length;
|
||||
final int numAlleles = set.getACcounts().getCounts().length;
|
||||
final int newPloidy = set.getACsum();
|
||||
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result);
|
||||
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, resultTracker);
|
||||
|
||||
|
||||
// add to new pool
|
||||
if (!Double.isInfinite(log10LofK))
|
||||
newPool.add(set);
|
||||
|
||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
// TODO -- uncomment this correct line when the implementation of this model is optimized (it's too slow now to handle this fix)
|
||||
//if ( log10LofK < stateTracker.maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY && stateTracker.isLowerAC(set.ACcounts) ) {
|
||||
if ( log10LofK < stateTracker.getMaxLog10L() - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
if ( VERBOSE )
|
||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, stateTracker.getMaxLog10L());
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// iterate over higher frequencies if possible
|
||||
// by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
|
||||
// so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space
|
||||
final int ACwiggle = set.ACcounts.counts[0];
|
||||
final int ACwiggle = set.getACcounts().getCounts()[0];
|
||||
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
|
||||
return log10LofK;
|
||||
|
||||
|
||||
// add conformations for other cases
|
||||
for ( int allele = 1; allele < numAlleles; allele++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
final int[] ACcountsClone = set.getACcounts().getCounts().clone();
|
||||
ACcountsClone[allele]++;
|
||||
// is this a valid conformation?
|
||||
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
|
||||
|
|
@ -322,11 +335,11 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
* @param ploidy2 Ploidy of second pool
|
||||
* @param numAlleles Number of alleles
|
||||
* @param log10AlleleFrequencyPriors Array of biallelic priors
|
||||
* @param result Af calculation result object
|
||||
* @param resultTracker Af calculation result object
|
||||
*/
|
||||
public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
/*
|
||||
final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
|
||||
final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
|
||||
|
|
@ -380,7 +393,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
* @param numAlleles Number of alleles (including ref)
|
||||
* @param ploidy1 Ploidy of original pool (combined)
|
||||
* @param ploidy2 Ploidy of new pool
|
||||
* @param result AFResult object
|
||||
* @param resultTracker AFResult object
|
||||
* @return log-likelihood of requested conformation
|
||||
*/
|
||||
private static double computeLofK(final ExactACset set,
|
||||
|
|
@ -388,7 +401,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
final double[] secondGL,
|
||||
final double[] log10AlleleFrequencyPriors,
|
||||
final int numAlleles, final int ploidy1, final int ploidy2,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
|
||||
final int newPloidy = ploidy1 + ploidy2;
|
||||
|
||||
|
|
@ -397,17 +410,18 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
if (newPloidy != totalAltK)
|
||||
throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values");
|
||||
|
||||
totalAltK -= set.ACcounts.counts[0];
|
||||
totalAltK -= set.getACcounts().getCounts()[0];
|
||||
// totalAltK has sum of alt alleles of conformation now
|
||||
|
||||
|
||||
// special case for k = 0 over all k
|
||||
if ( totalAltK == 0 ) { // all-ref case
|
||||
final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
|
||||
set.log10Likelihoods[0] = log10Lof0;
|
||||
set.getLog10Likelihoods()[0] = log10Lof0;
|
||||
|
||||
result.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
return log10Lof0;
|
||||
|
||||
} else {
|
||||
|
||||
|
|
@ -415,12 +429,12 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
// ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy.
|
||||
// To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i
|
||||
|
||||
int[] currentCount = set.ACcounts.getCounts();
|
||||
int[] currentCount = set.getACcounts().getCounts();
|
||||
double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);
|
||||
|
||||
// for current conformation, get all possible ways to break vector K into two components G1 and G2
|
||||
final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
|
||||
set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY;
|
||||
set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY;
|
||||
while (innerIterator.hasNext()) {
|
||||
// check if breaking current conformation into g1 and g2 is feasible.
|
||||
final int[] acCount2 = innerIterator.getCurrentVector();
|
||||
|
|
@ -436,27 +450,27 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2);
|
||||
final double sum = firstGL + gl2 + num1 + num2;
|
||||
|
||||
set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum);
|
||||
set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum);
|
||||
}
|
||||
}
|
||||
innerIterator.next();
|
||||
}
|
||||
|
||||
set.log10Likelihoods[0] += denom;
|
||||
set.getLog10Likelihoods()[0] += denom;
|
||||
}
|
||||
|
||||
double log10LofK = set.log10Likelihoods[0];
|
||||
double log10LofK = set.getLog10Likelihoods()[0];
|
||||
|
||||
// update the MLE if necessary
|
||||
final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length);
|
||||
result.updateMLEifNeeded(log10LofK, altCounts);
|
||||
final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length);
|
||||
resultTracker.updateMLEifNeeded(log10LofK, altCounts);
|
||||
|
||||
// apply the priors over each alternate allele
|
||||
for (final int ACcount : altCounts ) {
|
||||
if ( ACcount > 0 )
|
||||
log10LofK += log10AlleleFrequencyPriors[ACcount];
|
||||
}
|
||||
result.updateMAPifNeeded(log10LofK, altCounts);
|
||||
resultTracker.updateMAPifNeeded(log10LofK, altCounts);
|
||||
|
||||
return log10LofK;
|
||||
}
|
||||
|
|
@ -488,12 +502,12 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
|
||||
* @param ploidy2 Ploidy of second pool
|
||||
* @param log10AlleleFrequencyPriors Array of biallelic priors
|
||||
* @param result Af calculation result object
|
||||
* @param resultTracker Af calculation result object
|
||||
* @return Combined likelihood vector
|
||||
*/
|
||||
public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
|
||||
final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
final AFCalcResultTracker resultTracker) {
|
||||
|
||||
final int newPloidy = ploidy1 + ploidy2;
|
||||
|
||||
|
|
@ -518,8 +532,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
|
||||
|
||||
final double log10Lof0 = x[0]+y[0];
|
||||
result.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
resultTracker.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
resultTracker.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
|
||||
double maxElement = log10Lof0;
|
||||
int maxElementIdx = 0;
|
||||
|
|
@ -561,8 +575,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
}
|
||||
|
||||
alleleCounts[0] = k;
|
||||
result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
|
||||
result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
|
||||
resultTracker.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
|
||||
resultTracker.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -614,7 +628,10 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
|
|||
// create the new likelihoods array from the alleles we are allowed to use
|
||||
final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
|
||||
double[] newLikelihoods;
|
||||
if ( numOriginalAltAlleles == numNewAltAlleles) {
|
||||
|
||||
// Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization
|
||||
// and subsetting
|
||||
if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) {
|
||||
newLikelihoods = originalLikelihoods;
|
||||
} else {
|
||||
newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse);
|
||||
|
|
@ -2,6 +2,9 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Comparator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
|
|
@ -9,7 +12,7 @@ import org.jgrapht.graph.DefaultDirectedGraph;
|
|||
*/
|
||||
|
||||
// simple edge class for connecting nodes in the graph
|
||||
public class DeBruijnEdge implements Comparable<DeBruijnEdge> {
|
||||
public class DeBruijnEdge {
|
||||
|
||||
private int multiplicity;
|
||||
private boolean isRef;
|
||||
|
|
@ -53,8 +56,10 @@ public class DeBruijnEdge implements Comparable<DeBruijnEdge> {
|
|||
return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo( final DeBruijnEdge that ) {
|
||||
return this.multiplicity - that.multiplicity;
|
||||
public static class EdgeWeightComparator implements Comparator<DeBruijnEdge>, Serializable {
|
||||
@Override
|
||||
public int compare(final DeBruijnEdge edge1, final DeBruijnEdge edge2) {
|
||||
return edge1.multiplicity - edge2.multiplicity;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ public class DeBruijnVertex {
|
|||
public final int kmer;
|
||||
|
||||
public DeBruijnVertex( final byte[] sequence, final int kmer ) {
|
||||
this.sequence = sequence;
|
||||
this.sequence = sequence.clone();
|
||||
this.kmer = kmer;
|
||||
}
|
||||
|
||||
|
|
@ -37,7 +37,7 @@ public class DeBruijnVertex {
|
|||
}
|
||||
|
||||
public byte[] getSequence() {
|
||||
return sequence;
|
||||
return sequence.clone();
|
||||
}
|
||||
|
||||
public byte[] getSuffix() {
|
||||
|
|
|
|||
|
|
@ -52,7 +52,11 @@ public class GenotypingEngine {
|
|||
noCall.add(Allele.NO_CALL);
|
||||
}
|
||||
|
||||
// This function is the streamlined approach, currently not being used
|
||||
// WARN
|
||||
// This function is the streamlined approach, currently not being used by default
|
||||
// WARN
|
||||
// WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code.
|
||||
// WARN
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final ArrayList<Haplotype> haplotypes,
|
||||
|
|
@ -184,6 +188,7 @@ public class GenotypingEngine {
|
|||
return returnCalls;
|
||||
}
|
||||
|
||||
// BUGBUG: Create a class to hold this complicated return type
|
||||
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
|
||||
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
|
||||
final ArrayList<Haplotype> haplotypes,
|
||||
|
|
@ -210,13 +215,8 @@ public class GenotypingEngine {
|
|||
System.out.println( ">> Events = " + h.getEventMap());
|
||||
}
|
||||
}
|
||||
// Create the VC merge priority list
|
||||
final ArrayList<String> priorityList = new ArrayList<String>();
|
||||
for( int iii = 0; iii < haplotypes.size(); iii++ ) {
|
||||
priorityList.add("HC" + iii);
|
||||
}
|
||||
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes, priorityList );
|
||||
cleanUpSymbolicUnassembledEvents( haplotypes );
|
||||
if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure
|
||||
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
|
||||
}
|
||||
|
|
@ -229,13 +229,16 @@ public class GenotypingEngine {
|
|||
// Walk along each position in the key set and create each event to be outputted
|
||||
for( final int loc : startPosKeySet ) {
|
||||
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
|
||||
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
|
||||
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
|
||||
final ArrayList<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
|
||||
|
||||
if( activeAllelesToGenotype.isEmpty() ) {
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
|
||||
final VariantContext vc = eventMap.get(loc);
|
||||
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
|
||||
eventsAtThisLoc.add(vc);
|
||||
priorityList.add(vc.getSource());
|
||||
}
|
||||
}
|
||||
} else { // we are in GGA mode!
|
||||
|
|
@ -260,11 +263,27 @@ public class GenotypingEngine {
|
|||
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
|
||||
|
||||
// Sanity check the priority list
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
if( !priorityList.contains(vc.getSource()) ) {
|
||||
throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles.");
|
||||
}
|
||||
}
|
||||
for( final String name : priorityList ) {
|
||||
boolean found = false;
|
||||
for( final VariantContext vc : eventsAtThisLoc ) {
|
||||
if(vc.getSource().equals(name)) { found = true; break; }
|
||||
}
|
||||
if( !found ) {
|
||||
throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. Something went wrong in the merging of alleles.");
|
||||
}
|
||||
}
|
||||
|
||||
// Merge the event to find a common reference representation
|
||||
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
|
||||
if( mergedVC == null ) { continue; }
|
||||
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
int aCount = 0;
|
||||
for( final Allele a : mergedVC.getAlleles() ) {
|
||||
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
|
||||
|
|
@ -289,9 +308,20 @@ public class GenotypingEngine {
|
|||
}
|
||||
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
|
||||
}
|
||||
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
|
||||
VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
|
||||
if( call != null ) {
|
||||
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
|
||||
final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
|
||||
// also, need to update the allele -> haplotype mapping
|
||||
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMapTrim = new HashMap<Allele, ArrayList<Haplotype>>();
|
||||
for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
|
||||
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii)));
|
||||
}
|
||||
|
||||
call = vcCallTrim;
|
||||
alleleHashMap = alleleHashMapTrim;
|
||||
}
|
||||
|
||||
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
|
||||
}
|
||||
}
|
||||
|
|
@ -299,9 +329,8 @@ public class GenotypingEngine {
|
|||
return returnCalls;
|
||||
}
|
||||
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes, final ArrayList<String> priorityList ) {
|
||||
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes ) {
|
||||
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
|
||||
final ArrayList<String> stringsToRemove = new ArrayList<String>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
for( final VariantContext vc : h.getEventMap().values() ) {
|
||||
if( vc.isSymbolic() ) {
|
||||
|
|
@ -309,7 +338,6 @@ public class GenotypingEngine {
|
|||
for( final VariantContext vc2 : h2.getEventMap().values() ) {
|
||||
if( vc.getStart() == vc2.getStart() && vc2.isIndel() ) {
|
||||
haplotypesToRemove.add(h);
|
||||
stringsToRemove.add(vc.getSource());
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
@ -318,7 +346,6 @@ public class GenotypingEngine {
|
|||
}
|
||||
}
|
||||
haplotypes.removeAll(haplotypesToRemove);
|
||||
priorityList.removeAll(stringsToRemove);
|
||||
}
|
||||
|
||||
protected void mergeConsecutiveEventsBasedOnLD( final ArrayList<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
|
||||
|
|
|
|||
|
|
@ -27,29 +27,23 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionType;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
|
|
@ -57,6 +51,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
|
|||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -106,6 +101,7 @@ import java.util.*;
|
|||
|
||||
@DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@PartitionBy(PartitionType.LOCUS)
|
||||
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
|
||||
@ActiveRegionExtension(extension=65, maxRegion=300)
|
||||
public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implements AnnotatorCompatible {
|
||||
|
||||
|
|
@ -177,7 +173,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
* so annotations will be excluded even if they are explicitly included with the other options.
|
||||
*/
|
||||
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
|
||||
protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"HaplotypeScore", "MappingQualityZero", "SpanningDeletions", "TandemRepeatAnnotator"}));
|
||||
protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"}));
|
||||
|
||||
/**
|
||||
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
|
||||
|
|
@ -241,9 +237,13 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
|
||||
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING);
|
||||
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING);
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
|
||||
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
|
||||
|
||||
// create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
|
||||
UnifiedArgumentCollection simpleUAC = UAC.clone();
|
||||
simpleUAC.exactCallsLog = null;
|
||||
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
|
||||
|
||||
// initialize the output VCF header
|
||||
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
|
||||
|
|
@ -312,7 +312,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) {
|
||||
if( !allelesToGenotype.contains(vc) ) {
|
||||
allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a ReadMetaDataTracker object
|
||||
allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object
|
||||
}
|
||||
}
|
||||
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
|
||||
|
|
@ -414,7 +414,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
|
|||
: genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
|
||||
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
|
||||
|
||||
final Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult );
|
||||
final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult );
|
||||
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
|
||||
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
|
||||
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import org.apache.commons.lang.ArrayUtils;
|
|||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -76,13 +77,15 @@ public class KBestPaths {
|
|||
}
|
||||
}
|
||||
|
||||
protected static class PathComparatorTotalScore implements Comparator<Path> {
|
||||
protected static class PathComparatorTotalScore implements Comparator<Path>, Serializable {
|
||||
@Override
|
||||
public int compare(final Path path1, final Path path2) {
|
||||
return path1.totalScore - path2.totalScore;
|
||||
}
|
||||
}
|
||||
|
||||
//protected static class PathComparatorLowestEdge implements Comparator<Path> {
|
||||
//protected static class PathComparatorLowestEdge implements Comparator<Path>, Serializable {
|
||||
// @Override
|
||||
// public int compare(final Path path1, final Path path2) {
|
||||
// return path2.lowestEdge - path1.lowestEdge;
|
||||
// }
|
||||
|
|
@ -124,7 +127,7 @@ public class KBestPaths {
|
|||
// recursively run DFS
|
||||
final ArrayList<DeBruijnEdge> edgeArrayList = new ArrayList<DeBruijnEdge>();
|
||||
edgeArrayList.addAll(graph.outgoingEdgesOf(path.lastVertex));
|
||||
Collections.sort(edgeArrayList);
|
||||
Collections.sort(edgeArrayList, new DeBruijnEdge.EdgeWeightComparator());
|
||||
Collections.reverse(edgeArrayList);
|
||||
for ( final DeBruijnEdge edge : edgeArrayList ) {
|
||||
// make sure the edge is not already in the path
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
|||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.PerReadAlleleLikelihoodMap;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
|
@ -39,7 +40,6 @@ import java.util.*;
|
|||
public class LikelihoodCalculationEngine {
|
||||
|
||||
private static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1;
|
||||
private final byte constantGCP;
|
||||
private final boolean DEBUG;
|
||||
private final PairHMM pairHMM;
|
||||
|
|
@ -77,10 +77,10 @@ public class LikelihoodCalculationEngine {
|
|||
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
|
||||
|
||||
// for each sample's reads
|
||||
for( final String sample : perSampleReadList.keySet() ) {
|
||||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
|
||||
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
|
||||
// evaluate the likelihood of the reads given those haplotypes
|
||||
computeReadLikelihoods( haplotypes, perSampleReadList.get(sample), sample, matchMetricArray, XMetricArray, YMetricArray );
|
||||
computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey(), matchMetricArray, XMetricArray, YMetricArray );
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -183,7 +183,7 @@ public class LikelihoodCalculationEngine {
|
|||
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -322,19 +322,21 @@ public class LikelihoodCalculationEngine {
|
|||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
public static Map<String, Map<Allele, List<GATKSAMRecord>>> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList, final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
|
||||
final Map<String, Map<Allele, List<GATKSAMRecord>>> returnMap = new HashMap<String, Map<Allele, List<GATKSAMRecord>>>();
|
||||
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
|
||||
final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
|
||||
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
|
||||
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
|
||||
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>();
|
||||
final PerReadAlleleLikelihoodMap likelihoodMap = new PerReadAlleleLikelihoodMap();
|
||||
|
||||
final ArrayList<GATKSAMRecord> readsForThisSample = sample.getValue();
|
||||
for( int iii = 0; iii < readsForThisSample.size(); iii++ ) {
|
||||
final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same!
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
final double likelihoods[] = new double[call.getFirst().getAlleles().size()];
|
||||
int count = 0;
|
||||
for( final Allele a : call.getFirst().getAlleles() ) { // find the allele with the highest haplotype likelihood
|
||||
for( final Allele a : call.getFirst().getAlleles() ) {
|
||||
double maxLikelihood = Double.NEGATIVE_INFINITY;
|
||||
for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object)
|
||||
final double likelihood = h.getReadLikelihoods(sample.getKey())[iii];
|
||||
|
|
@ -342,43 +344,21 @@ public class LikelihoodCalculationEngine {
|
|||
maxLikelihood = likelihood;
|
||||
}
|
||||
}
|
||||
likelihoods[count++] = maxLikelihood;
|
||||
likelihoodMap.add(read, a, maxLikelihood);
|
||||
}
|
||||
final int bestAllele = MathUtils.maxElementIndex(likelihoods);
|
||||
final double bestLikelihood = likelihoods[bestAllele];
|
||||
Allele allele = Allele.NO_CALL;
|
||||
boolean isInformativeRead = false;
|
||||
for( final double likelihood : likelihoods ) {
|
||||
if( bestLikelihood - likelihood > BEST_LIKELIHOOD_THRESHOLD ) {
|
||||
isInformativeRead = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// uninformative reads get the no call Allele
|
||||
if( isInformativeRead ) {
|
||||
allele = call.getFirst().getAlleles().get(bestAllele);
|
||||
}
|
||||
List<GATKSAMRecord> readList = alleleReadMap.get(allele);
|
||||
if( readList == null ) {
|
||||
readList = new ArrayList<GATKSAMRecord>();
|
||||
alleleReadMap.put(allele, readList);
|
||||
}
|
||||
readList.add(read);
|
||||
}
|
||||
}
|
||||
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
|
||||
List<GATKSAMRecord> readList = alleleReadMap.get(Allele.NO_CALL);
|
||||
if( readList == null ) {
|
||||
readList = new ArrayList<GATKSAMRecord>();
|
||||
alleleReadMap.put(Allele.NO_CALL, readList);
|
||||
}
|
||||
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
readList.add(read);
|
||||
for( final Allele a : call.getFirst().getAlleles() )
|
||||
likelihoodMap.add(read, a, 0.0);
|
||||
}
|
||||
}
|
||||
returnMap.put(sample.getKey(), alleleReadMap);
|
||||
|
||||
returnMap.put(sample.getKey(), likelihoodMap);
|
||||
|
||||
}
|
||||
return returnMap;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -184,7 +184,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
for( final GATKSAMRecord read : reads ) {
|
||||
final byte[] sequence = read.getReadBases();
|
||||
final byte[] qualities = read.getBaseQualities();
|
||||
final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not readuced
|
||||
final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced
|
||||
if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) {
|
||||
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
|
||||
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
|
||||
|
|
@ -201,7 +201,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
|||
// compute mean number of reduced read counts in current kmer span
|
||||
final byte[] counts = Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1);
|
||||
// precise rounding can make a difference with low consensus counts
|
||||
countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length);
|
||||
countNumber = MathUtils.arrayMax(counts);
|
||||
// countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length);
|
||||
}
|
||||
|
||||
if( !badKmer ) {
|
||||
|
|
|
|||
|
|
@ -34,7 +34,6 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
" -I " + bam +
|
||||
" -L " + interval +
|
||||
args +
|
||||
" --no_plots" +
|
||||
" -knownSites " + (reference.equals(b36KGReference) ? b36dbSNP129 : hg18dbSNP132) +
|
||||
" -o %s";
|
||||
}
|
||||
|
|
@ -50,21 +49,21 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
|
||||
String HiSeqInterval = "chr1:10,000,000-10,100,000";
|
||||
return new Object[][]{
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "1cfc73371abb933ca26496745d105ff0")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "ee5142776008741b1b2453b1258c6d99")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "fbc520794f0f98d52159de956f7217f1")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab5b93794049c514bf8e407019d76b67")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "81df636e3d0ed6f16113517e0169bc96")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "ad3c47355448f8c45e172c6e1129c65d")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "fef7240140a9b6d6335ce009fa4edec5")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "600652ee49b9ce1ca2d8ee2d8b7c8211")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "769f95b9dcc78a405d3e6b191e5a19f5")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "43fcba51264cc98bd8466d21e1b96766")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "48aaf9ac54b97eac3663882a59354ab2")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "dac04b9e1e1c52af8d3a50c2e550fda9")},
|
||||
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "90d70542076715a8605a8d4002614b34")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "600652ee49b9ce1ca2d8ee2d8b7c8211")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "26a04f5a28c40750c603cbe8a926d7bd")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "55a46d8f5d2f9acfa2d7659e18b6df43")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "8e930f56a8905a5999af7d6ba8a92f91")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "8e87bee4bd6531b405082c4da785f1f5")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "b309a5f57b861d7f31cb76cdac4ff8a7")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "4c75d47ed2cf93b499be8fbb29b24dfd")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "43b06e5568a89e4ce1dd9146ce580c89")},
|
||||
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "25f4f48dba27475b0cd7c06ef0239aba")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "dfcba9acc32b4a1dfeceea135b48615a")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "e8077b721f2e6f51c1945b6f6236835c")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "fbdc8d0fd312e3a7f49063c580cf5d92")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "4f47415628201a4f3c33e48ec066677b")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "1e89d2b88f4218363b9322b38e9536f2")},
|
||||
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "a7beb0b16756257a274eecf73474ed90")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "dfcba9acc32b4a1dfeceea135b48615a")},
|
||||
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "2082c70e08f1c14290c3812021832f83")},
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -75,11 +74,10 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
Arrays.asList(params.md5));
|
||||
executeTest("testBQSR-"+params.args, spec).getFirst();
|
||||
|
||||
// TODO -- re-enable once parallelization is fixed in BaseRecalibrator
|
||||
//WalkerTestSpec specNT2 = new WalkerTestSpec(
|
||||
// params.getCommandLine() + " -nt 2",
|
||||
// Arrays.asList(params.md5));
|
||||
//executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst();
|
||||
WalkerTestSpec specNT2 = new WalkerTestSpec(
|
||||
params.getCommandLine() + " -nt 2",
|
||||
Arrays.asList(params.md5));
|
||||
executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst();
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -89,7 +87,6 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
" -R " + b36KGReference +
|
||||
" -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" +
|
||||
" -L 1:10,000,000-10,200,000" +
|
||||
" --no_plots" +
|
||||
" -o %s",
|
||||
1, // just one output file
|
||||
UserException.CommandLineException.class);
|
||||
|
|
@ -103,7 +100,6 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
" -R " + b36KGReference +
|
||||
" -I " + privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam" +
|
||||
" -L 1:50,000-80,000" +
|
||||
" --no_plots" +
|
||||
" -o %s",
|
||||
1, // just one output file
|
||||
UserException.class);
|
||||
|
|
@ -128,10 +124,10 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
@DataProvider(name = "PRTest")
|
||||
public Object[][] createPRTestData() {
|
||||
return new Object[][]{
|
||||
{new PRTest("", "d2d6ed8667cdba7e56f5db97d6262676")},
|
||||
{new PRTest(" -qq -1", "b7053d3d67aba6d8892f0a60f0ded338")},
|
||||
{new PRTest(" -qq 6", "bfbf0855185b2b70aa35237fb71e4487")},
|
||||
{new PRTest(" -DIQ", "66aa65223f192ee39c1773aa187fd493")}
|
||||
{new PRTest("", "ab2f209ab98ad3432e208cbd524a4c4a")},
|
||||
{new PRTest(" -qq -1", "5226c06237b213b9e9b25a32ed92d09a")},
|
||||
{new PRTest(" -qq 6", "b592a5c62b952a012e18adb898ea9c33")},
|
||||
{new PRTest(" -DIQ", "8977bea0c57b808e65e9505eb648cdf7")}
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@ -141,7 +137,7 @@ public class BQSRIntegrationTest extends WalkerTest {
|
|||
"-T PrintReads" +
|
||||
" -R " + hg18Reference +
|
||||
" -I " + privateTestDir + "HiSeq.1mb.1RG.bam" +
|
||||
" -BQSR " + privateTestDir + "HiSeq.1mb.1RG.table" +
|
||||
" -BQSR " + privateTestDir + "HiSeq.20mb.1RG.table" +
|
||||
params.args +
|
||||
" -o %s",
|
||||
Arrays.asList(params.md5));
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ public class BaseCountsUnitTest extends BaseTest {
|
|||
|
||||
String name = String.format("Test-%s", params.bases);
|
||||
Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
|
||||
Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name);
|
||||
Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name);
|
||||
Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name);
|
||||
}
|
||||
}
|
||||
|
|
@ -21,33 +21,33 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
executeTest(testName, spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
@Test(enabled = false)
|
||||
public void testDefaultCompression() {
|
||||
RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
@Test(enabled = false)
|
||||
public void testMultipleIntervals() {
|
||||
String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
|
||||
RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
@Test(enabled = false)
|
||||
public void testHighCompression() {
|
||||
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
@Test(enabled = false)
|
||||
public void testLowCompression() {
|
||||
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
@Test(enabled = false)
|
||||
public void testIndelCompression() {
|
||||
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
@Test(enabled = false)
|
||||
public void testFilteredDeletionCompression() {
|
||||
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
|
||||
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb")));
|
||||
|
|
@ -61,7 +61,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
*
|
||||
* This bam is simplified to replicate the exact bug with the three provided intervals.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
@Test(enabled = false)
|
||||
public void testAddingReadAfterTailingTheStash() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
|
||||
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6")));
|
||||
|
|
@ -71,7 +71,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
|
|||
* Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get
|
||||
* filtered out.
|
||||
*/
|
||||
@Test(enabled = true)
|
||||
@Test(enabled = false)
|
||||
public void testDivideByZero() {
|
||||
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
|
||||
executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2")));
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ public void testBaseCounts() {
|
|||
new TestRead(bases, quals, new Byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})};
|
||||
|
||||
for (TestRead testRead : testReads) {
|
||||
SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false);
|
||||
SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false);
|
||||
Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts());
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
|
|
@ -18,8 +18,9 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
|
|||
final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
|
||||
final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
|
||||
final String REFSAMPLE_NAME = "NA12878";
|
||||
final String MTINTERVALS = "MT:1-3000";
|
||||
final String LSVINTERVALS = "20:40,000,000-41,000,000";
|
||||
final String MTINTERVALS = "MT:1-1000";
|
||||
final String LSVINTERVALS = "20:40,500,000-41,000,000";
|
||||
final String LSVINTERVALS_SHORT = "20:40,500,000-40,501,000";
|
||||
final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
|
||||
final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
|
||||
final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";
|
||||
|
|
@ -38,6 +39,13 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
|
|||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
}
|
||||
|
||||
private void PC_LSV_Test_short(String args, String name, String model, String md5) {
|
||||
final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane ",
|
||||
REF, LSV_BAM, LSVINTERVALS_SHORT, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
}
|
||||
|
||||
private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) {
|
||||
final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane",
|
||||
REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s";
|
||||
|
|
@ -45,33 +53,38 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
|
|||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testSNP_ACS_Pools() {
|
||||
PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES","LSV_SNP_ACS","SNP","ec19f0b7c7d57493cecfff988a4815c8");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testBOTH_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","0934f72865388999efec64bd9d4a9b93");
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","67dabdbf1e6ed8a83d2e85766558a20a");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","126581c72d287722437274d41b6fed7b");
|
||||
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","d4bfae27f1b07923f381d708d8a34cf4");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","b543aa1c3efedb301e525c1d6c50ed8d");
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","7d6f319b9edcb1ff8c290fef150a2df8");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","55b20557a836bb92688e68f12d7f5dc4");
|
||||
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","dd02890123e07e7412a49475cb6280f1");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMT_SNP_DISCOVERY_sp4() {
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","7eb889e8e07182f4c3d64609591f9459");
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","da84bf45f7080a46a7a78542b3a0629d");
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testMT_SNP_GGA_sp10() {
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "db8114877b99b14f7180fdcd24b040a7");
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "f8ea18ec6a717a77fdf8c5f2482d8d8d");
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,603 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class AFCalcUnitTest extends BaseTest {
|
||||
static Allele A = Allele.create("A", true);
|
||||
static Allele C = Allele.create("C");
|
||||
static Allele G = Allele.create("G");
|
||||
|
||||
static int sampleNameCounter = 0;
|
||||
static Genotype AA1, AB1, BB1, NON_INFORMATIVE1;
|
||||
static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2;
|
||||
final double[] FLAT_3SAMPLE_PRIORS = MathUtils.normalizeFromLog10(new double[2*3+1], true); // flat priors
|
||||
|
||||
final private static boolean INCLUDE_BIALLELIC = true;
|
||||
final private static boolean INCLUDE_TRIALLELIC = true;
|
||||
final private static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug
|
||||
final private static boolean DEBUG_ONLY = false;
|
||||
|
||||
/**
 * Builds the shared genotype fixtures used by the data providers in this class.
 *
 * Each fixture gets PL 0 for the genotype it represents and PL 20 for every other
 * genotype; the NON_INFORMATIVE fixtures get PL 0 everywhere and no-call alleles.
 * In the fixture names "A" is allele A, "B" is allele C and "C" is allele G.
 * makePL() names samples from a running counter, so the order of the calls below
 * determines the sample names.
 */
@BeforeSuite
|
||||
public void before() {
|
||||
// bi-allelic fixtures: three PLs, in the order AA, AB, BB
AA1 = makePL(Arrays.asList(A, A), 0, 20, 20);
|
||||
AB1 = makePL(Arrays.asList(A, C), 20, 0, 20);
|
||||
BB1 = makePL(Arrays.asList(C, C), 20, 20, 0);
|
||||
NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);
|
||||
|
||||
// tri-allelic fixtures: six PLs, in the order AA, AB, BB, AC, BC, CC
AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20);
|
||||
AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20);
|
||||
BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20);
|
||||
AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20);
|
||||
BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20);
|
||||
CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0);
|
||||
NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0);
|
||||
}
|
||||
|
||||
protected static Genotype makePL(final List<Allele> expectedGT, int ... pls) {
|
||||
GenotypeBuilder gb = new GenotypeBuilder("sample" + sampleNameCounter++);
|
||||
gb.alleles(expectedGT);
|
||||
gb.PL(pls);
|
||||
return gb.make();
|
||||
}
|
||||
|
||||
private class GetGLsTest extends TestDataProvider {
|
||||
GenotypesContext GLs;
|
||||
int numAltAlleles;
|
||||
final AFCalc calc;
|
||||
final int[] expectedACs;
|
||||
final double[] priors;
|
||||
final String priorName;
|
||||
|
||||
private GetGLsTest(final AFCalc calc, int numAltAlleles, List<Genotype> arg, final double[] priors, final String priorName) {
|
||||
super(GetGLsTest.class);
|
||||
GLs = GenotypesContext.create(new ArrayList<Genotype>(arg));
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
this.calc = calc;
|
||||
this.priors = priors;
|
||||
this.priorName = priorName;
|
||||
|
||||
expectedACs = new int[numAltAlleles+1];
|
||||
for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) {
|
||||
expectedACs[alleleI] = 0;
|
||||
final Allele allele = getAlleles().get(alleleI);
|
||||
for ( Genotype g : arg ) {
|
||||
expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public AFCalcResult execute() {
|
||||
return getCalc().getLog10PNonRef(getVC(), getPriors());
|
||||
}
|
||||
|
||||
public AFCalcResult executeRef() {
|
||||
final AFCalc ref = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_REFERENCE, getCalc().nSamples, getCalc().getMaxAltAlleles());
|
||||
return ref.getLog10PNonRef(getVC(), getPriors());
|
||||
}
|
||||
|
||||
public double[] getPriors() {
|
||||
return priors;
|
||||
}
|
||||
|
||||
public AFCalc getCalc() {
|
||||
return calc;
|
||||
}
|
||||
|
||||
public VariantContext getVC() {
|
||||
VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles());
|
||||
builder.genotypes(GLs);
|
||||
return builder.make();
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles() {
|
||||
return Arrays.asList(Allele.create("A", true),
|
||||
Allele.create("C"),
|
||||
Allele.create("G"),
|
||||
Allele.create("T")).subList(0, numAltAlleles+1);
|
||||
}
|
||||
|
||||
public int getExpectedAltAC(final int alleleI) {
|
||||
return expectedACs[alleleI+1];
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(),
|
||||
priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "wellFormedGLs")
|
||||
public Object[][] createSimpleGLsData() {
|
||||
final List<Genotype> biAllelicSamples = Arrays.asList(AA1, AB1, BB1);
|
||||
final List<Genotype> triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);
|
||||
|
||||
for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
|
||||
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
|
||||
Arrays.asList(
|
||||
AFCalcFactory.Calculation.EXACT_REFERENCE,
|
||||
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
|
||||
AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
|
||||
), 4, 2, 2, 2);
|
||||
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
final double[] humanPriors = new double[nPriorValues];
|
||||
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
|
||||
|
||||
for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
|
||||
for ( AFCalc model : calcs ) {
|
||||
final String priorName = priors == humanPriors ? "human" : "flat";
|
||||
|
||||
// bi-allelic
|
||||
if ( INCLUDE_BIALLELIC && nSamples <= biAllelicSamples.size() )
|
||||
for ( List<Genotype> genotypes : Utils.makePermutations(biAllelicSamples, nSamples, true) )
|
||||
new GetGLsTest(model, 1, genotypes, priors, priorName);
|
||||
|
||||
// tri-allelic
|
||||
if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) ) // || model != generalCalc ) )
|
||||
for ( List<Genotype> genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) )
|
||||
new GetGLsTest(model, 2, genotypes, priors, priorName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
@DataProvider(name = "badGLs")
|
||||
public Object[][] createBadGLs() {
|
||||
final List<Genotype> genotypes = Arrays.asList(AB2, BB2, CC2, CC2);
|
||||
final int nSamples = genotypes.size();
|
||||
|
||||
final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4);
|
||||
|
||||
final int nPriorValues = 2*nSamples+1;
|
||||
final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
|
||||
for ( AFCalc model : Arrays.asList(indCalc) ) {
|
||||
final String priorName = "flat";
|
||||
new GetGLsTest(model, 2, genotypes, priors, priorName);
|
||||
}
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
|
||||
public void testBiallelicGLs(GetGLsTest cfg) {
|
||||
if ( cfg.getAlleles().size() == 2 )
|
||||
testResultSimple(cfg);
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
|
||||
public void testTriallelicGLs(GetGLsTest cfg) {
|
||||
if ( cfg.getAlleles().size() > 2 )
|
||||
testResultSimple(cfg);
|
||||
}
|
||||
|
||||
/**
 * Runs the shared result check (testResultSimple) on every configuration
 * supplied by the "badGLs" data provider.
 */
@Test(enabled = true, dataProvider = "badGLs")
|
||||
public void testBadGLs(GetGLsTest cfg) {
|
||||
testResultSimple(cfg);
|
||||
}
|
||||
|
||||
private static class NonInformativeData {
|
||||
final Genotype nonInformative;
|
||||
final List<Genotype> called;
|
||||
final int nAltAlleles;
|
||||
|
||||
private NonInformativeData(List<Genotype> called, Genotype nonInformative, int nAltAlleles) {
|
||||
this.called = called;
|
||||
this.nonInformative = nonInformative;
|
||||
this.nAltAlleles = nAltAlleles;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "GLsWithNonInformative")
|
||||
public Object[][] makeGLsWithNonInformative() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<NonInformativeData> nonInformativeTests = new LinkedList<NonInformativeData>();
|
||||
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1));
|
||||
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2));
|
||||
nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2));
|
||||
|
||||
for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) {
|
||||
for ( final NonInformativeData testData : nonInformativeTests ) {
|
||||
final List<Genotype> samples = new ArrayList<Genotype>();
|
||||
samples.addAll(testData.called);
|
||||
samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative));
|
||||
|
||||
final int nSamples = samples.size();
|
||||
List<AFCalc> calcs = AFCalcFactory.createAFCalcs(
|
||||
Arrays.asList(
|
||||
AFCalcFactory.Calculation.EXACT_REFERENCE,
|
||||
AFCalcFactory.Calculation.EXACT_INDEPENDENT,
|
||||
AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY
|
||||
), 4, 2, 2, 2);
|
||||
|
||||
final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors
|
||||
|
||||
for ( AFCalc model : calcs ) {
|
||||
final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat");
|
||||
|
||||
for ( int rotation = 0; rotation < nSamples; rotation++ ) {
|
||||
Collections.rotate(samples, 1);
|
||||
final GetGLsTest withNonInformative = new GetGLsTest(model, testData.nAltAlleles, samples, priors, "flat");
|
||||
tests.add(new Object[]{onlyInformative, withNonInformative});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "GLsWithNonInformative", dependsOnMethods = {"testBiallelicGLs", "testTriallelicGLs"})
|
||||
public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) {
|
||||
final AFCalcResult expected = onlyInformative.execute();
|
||||
final AFCalcResult actual = withNonInformative.execute();
|
||||
|
||||
testResultSimple(withNonInformative);
|
||||
compareAFCalcResults(actual, expected, onlyInformative.getCalc(), true);
|
||||
}
|
||||
|
||||
private void testResultSimple(final GetGLsTest cfg) {
|
||||
final AFCalcResult refResultTracker = cfg.executeRef();
|
||||
final AFCalcResult resultTracker = cfg.execute();
|
||||
|
||||
compareAFCalcResults(resultTracker, refResultTracker, cfg.getCalc(), true);
|
||||
|
||||
Assert.assertNotNull(resultTracker.getAllelesUsedInGenotyping());
|
||||
Assert.assertTrue(cfg.getAlleles().containsAll(resultTracker.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list");
|
||||
|
||||
for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) {
|
||||
int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI);
|
||||
int calcAC_MLE = resultTracker.getAlleleCountsOfMLE()[altAlleleI];
|
||||
|
||||
final Allele allele = cfg.getAlleles().get(altAlleleI+1);
|
||||
Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele);
|
||||
}
|
||||
}
|
||||
|
||||
private void compareAFCalcResults(final AFCalcResult actual, final AFCalcResult expected, final AFCalc calc, final boolean onlyPosteriorsShouldBeEqual) {
|
||||
// note we cannot really test the multi-allelic case because we actually meaningfully differ among the models here
|
||||
final double TOLERANCE = calc.getMaxAltAlleles() > 1 ? 1000 : 0.1; // much tighter constraints on bi-allelic results
|
||||
|
||||
if ( ! onlyPosteriorsShouldBeEqual ) {
|
||||
Assert.assertEquals(actual.getLog10PriorOfAFEq0(), expected.getLog10PriorOfAFEq0(), TOLERANCE, "Priors AF == 0");
|
||||
Assert.assertEquals(actual.getLog10PriorOfAFGT0(), expected.getLog10PriorOfAFGT0(), TOLERANCE, "Priors AF > 0");
|
||||
Assert.assertEquals(actual.getLog10LikelihoodOfAFEq0(), expected.getLog10LikelihoodOfAFEq0(), TOLERANCE, "Likelihoods AF == 0");
|
||||
Assert.assertEquals(actual.getLog10LikelihoodOfAFGT0(), expected.getLog10LikelihoodOfAFGT0(), TOLERANCE, "Likelihoods AF > 0");
|
||||
}
|
||||
Assert.assertEquals(actual.getLog10PosteriorOfAFEq0(), expected.getLog10PosteriorOfAFEq0(), TOLERANCE, "Posteriors AF == 0");
|
||||
Assert.assertEquals(actual.getLog10PosteriorOfAFGT0(), expected.getLog10PosteriorOfAFGT0(), TOLERANCE, "Posteriors AF > 0");
|
||||
Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE(), "MLE ACs");
|
||||
Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping(), "Alleles used in genotyping");
|
||||
|
||||
for ( final Allele a : expected.getAllelesUsedInGenotyping() ) {
|
||||
if ( ! a.isReference() ) {
|
||||
Assert.assertEquals(actual.getAlleleCountAtMLE(a), expected.getAlleleCountAtMLE(a), "MLE AC for allele " + a);
|
||||
// TODO -- enable me when IndependentAllelesDiploidExactAFCalc works properly
|
||||
// if ( ! ( calc instanceof GeneralPloidyExactAFCalc ) )
|
||||
// // TODO -- delete when general ploidy works properly with multi-allelics
|
||||
// Assert.assertEquals(actual.isPolymorphic(a, 0.0), expected.isPolymorphic(a, 0.0), "isPolymorphic with thread 0.0 for allele " + a);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
|
||||
public void testLargeGLs(final ExactAFCalc calc) {
|
||||
final Genotype BB = makePL(Arrays.asList(C, C), 20000000, 20000000, 0);
|
||||
GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(BB, BB, BB), FLAT_3SAMPLE_PRIORS, "flat");
|
||||
|
||||
final AFCalcResult resultTracker = cfg.execute();
|
||||
|
||||
int calculatedAlleleCount = resultTracker.getAlleleCountsOfMLE()[0];
|
||||
Assert.assertEquals(calculatedAlleleCount, 6);
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
|
||||
public void testMismatchedGLs(final ExactAFCalc calc) {
|
||||
final Genotype AB = makePL(Arrays.asList(A, C), 2000, 0, 2000, 2000, 2000, 2000);
|
||||
final Genotype AC = makePL(Arrays.asList(A, G), 100, 100, 100, 0, 100, 100);
|
||||
GetGLsTest cfg = new GetGLsTest(calc, 2, Arrays.asList(AB, AC), FLAT_3SAMPLE_PRIORS, "flat");
|
||||
|
||||
final AFCalcResult resultTracker = cfg.execute();
|
||||
|
||||
Assert.assertEquals(resultTracker.getAlleleCountsOfMLE()[0], 1);
|
||||
Assert.assertEquals(resultTracker.getAlleleCountsOfMLE()[1], 1);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Code to test that the pNonRef value is meaningful
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
private static class PNonRefData {
|
||||
final Genotype g;
|
||||
final double pNonRef, tolerance;
|
||||
final boolean canScale;
|
||||
final List<AFCalcFactory.Calculation> badModels;
|
||||
final VariantContext vc;
|
||||
|
||||
private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale) {
|
||||
this(vc, g, pNonRef, tolerance, canScale, Collections.<AFCalcFactory.Calculation>emptyList());
|
||||
}
|
||||
|
||||
private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale, final List<AFCalcFactory.Calculation> badModels) {
|
||||
this.g = g;
|
||||
this.pNonRef = pNonRef;
|
||||
this.tolerance = tolerance;
|
||||
this.canScale = canScale;
|
||||
this.badModels = badModels;
|
||||
this.vc = vc;
|
||||
}
|
||||
|
||||
public PNonRefData scale(final int scaleFactor) {
|
||||
if ( canScale ) {
|
||||
final int[] PLs = new int[g.getPL().length];
|
||||
for ( int i = 0; i < PLs.length; i++ ) PLs[i] = g.getPL()[i] * ((int)Math.log10(scaleFactor)+1);
|
||||
final Genotype scaledG = new GenotypeBuilder(g).PL(PLs).make();
|
||||
final double scaledPNonRef = pNonRef < 0.5 ? pNonRef / scaleFactor : 1 - ((1-pNonRef) / scaleFactor);
|
||||
return new PNonRefData(vc, scaledG, scaledPNonRef, tolerance, true);
|
||||
} else {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "PNonRef")
|
||||
public Object[][] makePNonRefTest() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<Allele> AA = Arrays.asList(A, A);
|
||||
final List<Allele> AC = Arrays.asList(A, C);
|
||||
final List<Allele> CC = Arrays.asList(C, C);
|
||||
final List<Allele> AG = Arrays.asList(A, G);
|
||||
final List<Allele> GG = Arrays.asList(G, G);
|
||||
final List<Allele> CG = Arrays.asList(C, G);
|
||||
|
||||
final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make();
|
||||
final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();
|
||||
final AFCalcTestBuilder.PriorType priorType = AFCalcTestBuilder.PriorType.flat;
|
||||
|
||||
final List<AFCalcFactory.Calculation> constrainedModel = Arrays.asList(AFCalcFactory.Calculation.EXACT_CONSTRAINED);
|
||||
|
||||
final double TOLERANCE = 0.5;
|
||||
|
||||
final List<PNonRefData> initialPNonRefData = Arrays.asList(
|
||||
// bi-allelic sites
|
||||
new PNonRefData(vc2, makePL(AA, 0, 10, 10), 0.1666667, TOLERANCE, true),
|
||||
new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false, constrainedModel),
|
||||
new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false, constrainedModel),
|
||||
new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false, constrainedModel),
|
||||
new PNonRefData(vc2, makePL(AC, 10, 0, 10), 0.9166667, TOLERANCE, true),
|
||||
new PNonRefData(vc2, makePL(CC, 10, 10, 0), 0.9166667, TOLERANCE, true),
|
||||
|
||||
// tri-allelic sites -- cannot scale because of the naivety of our scaling estimator
|
||||
new PNonRefData(vc3, makePL(AA, 0, 10, 10, 10, 10, 10), 0.3023255813953489, TOLERANCE * 2, false), // more tolerance because constrained model is a bit inaccurate
|
||||
new PNonRefData(vc3, makePL(AC, 10, 0, 10, 10, 10, 10), 0.9166667, TOLERANCE, false),
|
||||
new PNonRefData(vc3, makePL(CC, 10, 10, 0, 10, 10, 10), 0.9166667, TOLERANCE, false),
|
||||
new PNonRefData(vc3, makePL(AG, 10, 10, 10, 0, 10, 10), 0.9166667, TOLERANCE, false),
|
||||
new PNonRefData(vc3, makePL(CG, 10, 10, 10, 10, 0, 10), 0.80, TOLERANCE, false),
|
||||
new PNonRefData(vc3, makePL(GG, 10, 10, 10, 10, 10, 0), 0.9166667, TOLERANCE, false)
|
||||
);
|
||||
|
||||
for ( AFCalcFactory.Calculation modelType : Arrays.asList(AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcFactory.Calculation.EXACT_INDEPENDENT) ) {
|
||||
for ( int nNonInformative = 0; nNonInformative < 3; nNonInformative++ ) {
|
||||
for ( final PNonRefData rootData : initialPNonRefData ) {
|
||||
for ( int plScale = 1; plScale <= 100000; plScale *= 10 ) {
|
||||
if ( ! rootData.badModels.contains(modelType) && (plScale == 1 || rootData.canScale) ) {
|
||||
final PNonRefData data = rootData.scale(plScale);
|
||||
tests.add(new Object[]{data.vc, modelType, priorType, Arrays.asList(data.g), data.pNonRef, data.tolerance, nNonInformative});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRef")
|
||||
private void testPNonRef(final VariantContext vcRoot,
|
||||
AFCalcFactory.Calculation modelType,
|
||||
AFCalcTestBuilder.PriorType priorType,
|
||||
final List<Genotype> genotypes,
|
||||
final double expectedPNonRef,
|
||||
final double tolerance,
|
||||
final int nNonInformative) {
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(1, vcRoot.getNAlleles()-1, modelType, priorType);
|
||||
|
||||
final VariantContextBuilder vcb = new VariantContextBuilder(vcRoot);
|
||||
vcb.genotypes(genotypes);
|
||||
|
||||
final AFCalcResult resultTracker = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());
|
||||
|
||||
Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), Math.log10(expectedPNonRef), tolerance,
|
||||
"Actual pNonRef not within tolerance " + tolerance + " of expected");
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test priors
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "Models")
|
||||
public Object[][] makeModels() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
for ( final AFCalcFactory.Calculation calc : AFCalcFactory.Calculation.values() ) {
|
||||
if ( calc.usableForParams(2, 4) )
|
||||
tests.add(new Object[]{AFCalcFactory.createAFCalc(calc, 2, 4)});
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true & ! DEBUG_ONLY, dataProvider = "Models")
|
||||
public void testBiallelicPriors(final AFCalc model) {
|
||||
|
||||
for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) {
|
||||
final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);
|
||||
|
||||
for ( int log10NonRefPrior = 1; log10NonRefPrior < 10*REF_PL; log10NonRefPrior += 1 ) {
|
||||
final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
|
||||
final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, (1-refPrior) / 2, (1-refPrior) / 2}), true);
|
||||
GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior);
|
||||
final AFCalcResult resultTracker = cfg.execute();
|
||||
final int actualAC = resultTracker.getAlleleCountsOfMLE()[0];
|
||||
|
||||
final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
|
||||
final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5);
|
||||
final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior));
|
||||
final double log10NonRefPost = Math.log10(nonRefPost);
|
||||
|
||||
if ( ! Double.isInfinite(log10NonRefPost) )
|
||||
Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2);
|
||||
|
||||
if ( nonRefPost >= 0.9 )
|
||||
Assert.assertTrue(resultTracker.isPolymorphic(C, -1));
|
||||
|
||||
final int expectedMLEAC = 1; // the MLE is independent of the prior
|
||||
Assert.assertEquals(actualAC, expectedMLEAC,
|
||||
"actual AC with priors " + log10NonRefPrior + " not expected "
|
||||
+ expectedMLEAC + " priors " + Utils.join(",", priors));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
|
||||
|
||||
// --------------------------------------------------------------------------------
|
||||
//
|
||||
// Test that polymorphic sites (bi and tri) are properly called
|
||||
//
|
||||
// --------------------------------------------------------------------------------
|
||||
|
||||
@DataProvider(name = "polyTestProvider")
|
||||
public Object[][] makePolyTestProvider() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
// list of all high-quality models in the system
|
||||
final List<AFCalcFactory.Calculation> models = Arrays.asList(
|
||||
AFCalcFactory.Calculation.EXACT,
|
||||
AFCalcFactory.Calculation.EXACT_REFERENCE,
|
||||
AFCalcFactory.Calculation.EXACT_INDEPENDENT);
|
||||
|
||||
// note that we cannot use small PLs here or the thresholds are hard to set
|
||||
for ( final int nonTypePLs : Arrays.asList(100, 1000) ) {
|
||||
for ( final AFCalcFactory.Calculation model : models ) {
|
||||
for ( final int allele1AC : Arrays.asList(0, 1, 2, 10, 100, 1000, 10000) ) {
|
||||
for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
|
||||
// for ( final int nonTypePLs : Arrays.asList(10) ) {
|
||||
// for ( final AFCalcFactory.Calculation model : models ) {
|
||||
// for ( final int allele1AC : Arrays.asList(100) ) {
|
||||
// for ( final int nSamples : Arrays.asList(1000) ) {
|
||||
if ( nSamples < allele1AC ) continue;
|
||||
|
||||
final double pPerSample = Math.pow(10, nonTypePLs / -10.0);
|
||||
final double errorFreq = pPerSample * nSamples;
|
||||
final boolean poly1 = allele1AC > errorFreq && (nonTypePLs * allele1AC) > 30;
|
||||
|
||||
// bi-allelic tests
|
||||
{
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(nSamples, 1, model, AFCalcTestBuilder.PriorType.human);
|
||||
final List<Integer> ACs = Arrays.asList(allele1AC);
|
||||
tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1)});
|
||||
}
|
||||
|
||||
// multi-allelic tests
|
||||
for ( final int allele2AC : Arrays.asList(0, 1, 2, 10, 20, 50) ) {
|
||||
if ( nSamples < allele2AC || allele1AC + allele2AC > nSamples || nSamples > 100 || nSamples == 1)
|
||||
continue;
|
||||
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(nSamples, 2, model, AFCalcTestBuilder.PriorType.human);
|
||||
final List<Integer> ACs = Arrays.asList(allele1AC, allele2AC);
|
||||
final boolean poly2 = allele2AC > errorFreq && (nonTypePLs * allele2AC) > 90;
|
||||
tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1, poly2)});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "polyTestProvider")
|
||||
public void testCallingGeneral(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly ) {
|
||||
testCalling(testBuilder, ACs, nonTypePL, expectedPoly);
|
||||
}
|
||||
|
||||
@DataProvider(name = "polyTestProviderLotsOfAlleles")
|
||||
public Object[][] makepolyTestProviderLotsOfAlleles() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
// list of all high-quality models in the system
|
||||
final List<AFCalcFactory.Calculation> models = Arrays.asList(AFCalcFactory.Calculation.EXACT_INDEPENDENT);
|
||||
|
||||
final List<Integer> alleleCounts = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 20);
|
||||
|
||||
final int nonTypePLs = 1000;
|
||||
final int nAlleles = 4;
|
||||
for ( final AFCalcFactory.Calculation model : models ) {
|
||||
for ( final List<Integer> ACs : Utils.makePermutations(alleleCounts, nAlleles, true) ) {
|
||||
final List<Boolean> isPoly = new ArrayList<Boolean>(ACs.size());
|
||||
for ( final int ac : ACs ) isPoly.add(ac > 0);
|
||||
|
||||
final double acSum = MathUtils.sum(ACs);
|
||||
for ( final int nSamples : Arrays.asList(1, 10, 100) ) {
|
||||
if ( nSamples < acSum ) continue;
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(nSamples, nAlleles, model, AFCalcTestBuilder.PriorType.human);
|
||||
tests.add(new Object[]{testBuilder, ACs, nonTypePLs, isPoly});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "polyTestProviderLotsOfAlleles")
|
||||
public void testCallingLotsOfAlleles(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly ) {
|
||||
testCalling(testBuilder, ACs, nonTypePL, expectedPoly);
|
||||
}
|
||||
|
||||
private void testCalling(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly) {
|
||||
final AFCalc calc = testBuilder.makeModel();
|
||||
final double[] priors = testBuilder.makePriors();
|
||||
final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);
|
||||
final AFCalcResult result = calc.getLog10PNonRef(vc, priors);
|
||||
|
||||
boolean anyPoly = false;
|
||||
for ( final boolean onePoly : expectedPoly ) anyPoly = anyPoly || onePoly;
|
||||
|
||||
if ( anyPoly )
|
||||
Assert.assertTrue(result.getLog10PosteriorOfAFGT0() > -1);
|
||||
|
||||
for ( int altI = 1; altI < result.getAllelesUsedInGenotyping().size(); altI++ ) {
|
||||
final int i = altI - 1;
|
||||
final Allele alt = result.getAllelesUsedInGenotyping().get(altI);
|
||||
|
||||
// must be getCalledChrCount because we cannot ensure that the VC made has our desired ACs
|
||||
Assert.assertEquals(result.getAlleleCountAtMLE(alt), vc.getCalledChrCount(alt));
|
||||
Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFGt0ForAllele(alt));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,124 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
public class ConstrainedAFCalculationModelUnitTest extends BaseTest {
|
||||
static Allele A = Allele.create("A", true);
|
||||
static Allele C = Allele.create("C");
|
||||
static Allele G = Allele.create("G");
|
||||
|
||||
protected static Genotype makePL(final List<Allele> expectedGT, int ... pls) {
|
||||
return AFCalcUnitTest.makePL(expectedGT, pls);
|
||||
}
|
||||
|
||||
@DataProvider(name = "MaxACsToVisit")
|
||||
public Object[][] makeMaxACsToVisit() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final int nSamples = 10;
|
||||
|
||||
for (int nNonInformative = 0; nNonInformative < nSamples - 1; nNonInformative++ ) {
|
||||
final int nChrom = (nSamples - nNonInformative) * 2;
|
||||
for ( int i = 0; i < nChrom; i++ ) {
|
||||
// bi-allelic
|
||||
tests.add(new Object[]{nSamples, Arrays.asList(i), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED});
|
||||
|
||||
// tri-allelic
|
||||
for ( int j = 0; j < (nChrom - i); j++)
|
||||
tests.add(new Object[]{nSamples, Arrays.asList(i, j), nNonInformative, AFCalcFactory.Calculation.EXACT_CONSTRAINED});
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "MaxACsToVisit")
|
||||
public void testMaxACsToVisit(final int nSamples, final List<Integer> requestedACs, final int nNonInformative, final AFCalcFactory.Calculation modelType) {
|
||||
final int nAlts = requestedACs.size();
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(nSamples, nAlts, modelType,
|
||||
AFCalcTestBuilder.PriorType.human);
|
||||
|
||||
final VariantContext vc = testBuilder.makeACTest(requestedACs, nNonInformative, 100);
|
||||
final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc);
|
||||
|
||||
testExpectedACs(vc, maxACsToVisit);
|
||||
}
|
||||
|
||||
private void testExpectedACs(final VariantContext vc, final int[] maxACsToVisit) {
|
||||
// this is necessary because cannot ensure that the tester gives us back the
|
||||
// requested ACs due to rounding errors
|
||||
final List<Integer> ACs = new ArrayList<Integer>();
|
||||
for ( final Allele a : vc.getAlternateAlleles() )
|
||||
ACs.add(vc.getCalledChrCount(a));
|
||||
|
||||
for ( int i = 0; i < maxACsToVisit.length; i++ ) {
|
||||
Assert.assertEquals(maxACsToVisit[i], (int)ACs.get(i), "Maximum AC computed wasn't equal to the max possible in the construction for alt allele " + i);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "MaxACsGenotypes")
|
||||
public Object[][] makeMaxACsForGenotype() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<Allele> AA = Arrays.asList(A, A);
|
||||
final List<Allele> AC = Arrays.asList(A, C);
|
||||
final List<Allele> CC = Arrays.asList(C, C);
|
||||
final List<Allele> AG = Arrays.asList(A, G);
|
||||
final List<Allele> GG = Arrays.asList(G, G);
|
||||
final List<Allele> CG = Arrays.asList(C, G);
|
||||
|
||||
final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make();
|
||||
final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();
|
||||
|
||||
tests.add(new Object[]{vc2, makePL(AA, 0, 10, 10)});
|
||||
tests.add(new Object[]{vc2, makePL(AC, 10, 0, 10)});
|
||||
tests.add(new Object[]{vc2, makePL(CC, 10, 10, 0)});
|
||||
|
||||
// make sure non-informative => 0
|
||||
tests.add(new Object[]{vc2, makePL(AA, 0, 0, 0)});
|
||||
tests.add(new Object[]{vc3, makePL(AA, 0, 0, 0, 0, 0, 0)});
|
||||
|
||||
// multi-allelics
|
||||
tests.add(new Object[]{vc3, makePL(AG, 10, 10, 10, 0, 10, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(CG, 10, 10, 10, 10, 0, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(GG, 10, 10, 10, 10, 10, 0)});
|
||||
|
||||
// deal with non-informatives third alleles
|
||||
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 10, 0, 0)});
|
||||
tests.add(new Object[]{vc3, makePL(AC, 10, 0, 10, 0, 0, 0)});
|
||||
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 10)});
|
||||
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 10, 0, 0)});
|
||||
tests.add(new Object[]{vc3, makePL(CC, 10, 10, 0, 0, 0, 0)});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "MaxACsGenotypes")
|
||||
private void testMakeACByGenotype(final VariantContext vcRoot, final Genotype g) {
|
||||
final VariantContext vc = new VariantContextBuilder(vcRoot).genotypes(g).make();
|
||||
|
||||
final AFCalcTestBuilder testBuilder
|
||||
= new AFCalcTestBuilder(1, vc.getNAlleles()-1, AFCalcFactory.Calculation.EXACT_CONSTRAINED,
|
||||
AFCalcTestBuilder.PriorType.human);
|
||||
|
||||
final int[] maxACsToVisit = ((ConstrainedDiploidExactAFCalc)testBuilder.makeModel()).computeMaxACs(vc);
|
||||
|
||||
testExpectedACs(vc, maxACsToVisit);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
|
|
@ -137,15 +138,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
|
|||
@Test(dataProvider = "getGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles);
|
||||
final AFCalcResultTracker resultTracker = new AFCalcResultTracker(cfg.numAltAlleles);
|
||||
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
|
||||
double[] priors = new double[len]; // flat priors
|
||||
|
||||
GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result);
|
||||
GeneralPloidyExactAFCalc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, resultTracker);
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
|
||||
int calculatedAlleleCount = resultTracker.getAlleleCountsOfMAP()[allele];
|
||||
|
||||
// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount);
|
||||
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
|
||||
|
|
@ -0,0 +1,210 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
// SEE private/R/pls.R if you want the truth output for these tests
|
||||
public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest {
|
||||
@DataProvider(name = "TestCombineGLs")
|
||||
public Object[][] makeTestCombineGLs() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
tests.add(new Object[]{1, 1, makePL( 0, 10, 20), makePL( 0, 10, 20)});
|
||||
tests.add(new Object[]{1, 1, makePL(10, 0, 20), makePL(10, 0, 20)});
|
||||
tests.add(new Object[]{1, 1, makePL(20, 10, 0), makePL(20, 10, 0)});
|
||||
|
||||
// AA AB BB AC BC CC => AA AB+BC CC
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 10, 20)});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 30, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5)});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 50, 0, 50, 50, 50, 50), makePL(45, 0, 50)});
|
||||
tests.add(new Object[]{2, 2, makePL( 50, 0, 50, 50, 50, 50), makePL( 0, 47, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 50, 50, 0, 50, 50, 50), makePL(45, 47, 0)});
|
||||
tests.add(new Object[]{2, 2, makePL( 50, 50, 0, 50, 50, 50), makePL( 0, 47, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(0, 47, 50)});
|
||||
tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(45, 0, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)});
|
||||
tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(0, 47, 50)});
|
||||
tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(45, 47, 0)});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@DataProvider(name = "TestCombineGLsWithDrops")
|
||||
public Object[][] makeTestCombineGLsWithDrops() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final Set<Integer> noDrops = Collections.emptySet();
|
||||
final Set<Integer> drop1 = Collections.singleton(1);
|
||||
final Set<Integer> drop2 = Collections.singleton(2);
|
||||
|
||||
// AA AB BB AC BC CC
|
||||
// drop1 (B): AA AC CC
|
||||
// drop2 (C): AA AB BB
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5), noDrops});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9), noDrops});
|
||||
tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 1, 2), drop2});
|
||||
tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 3, 5), drop1});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(0, 2, 6), noDrops});
|
||||
tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(1, 0, 2), noDrops});
|
||||
tests.add(new Object[]{1, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(2, 1, 0), drop2});
|
||||
tests.add(new Object[]{2, 2, makePL( 5, 4, 3, 2, 1, 0), makePL(5, 2, 0), drop1});
|
||||
|
||||
tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 8,11), noDrops});
|
||||
tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL( 5, 7, 0), noDrops});
|
||||
tests.add(new Object[]{1, 2, makePL(10,10,10,10,10, 0), makePL( 0, 0, 0), drop2});
|
||||
tests.add(new Object[]{2, 2, makePL(10,10,10,10,10, 0), makePL(10,10, 0), drop1});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
private Genotype makePL(final int ... PLs) {
|
||||
return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs);
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "TestCombineGLs")
|
||||
private void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) {
|
||||
testCombineGLsWithDrops(altIndex, nAlts, testg, expected, Collections.<Integer>emptySet());
|
||||
}
|
||||
|
||||
@Test(enabled = true, dataProvider = "TestCombineGLsWithDrops")
|
||||
private void testCombineGLsWithDrops(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected, Set<Integer> allelesToDrop) {
|
||||
final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
|
||||
final Genotype combined = calc.combineGLs(testg, altIndex, allelesToDrop, nAlts);
|
||||
|
||||
Assert.assertEquals(combined.getPL(), expected.getPL(),
|
||||
"Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL()));
|
||||
}
|
||||
|
||||
|
||||
// Alleles shared by the tests in this class. A is created with the extra boolean argument set
// to true (presumably marking it as the reference allele -- confirm against Allele.create);
// C and G are created without it.
static Allele A = Allele.create("A", true);
|
||||
static Allele C = Allele.create("C");
|
||||
static Allele G = Allele.create("G");
|
||||
|
||||
@DataProvider(name = "TestMakeAlleleConditionalContexts")
|
||||
public Object[][] makeTestMakeAlleleConditionalContexts() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final VariantContextBuilder root = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A));
|
||||
final VariantContextBuilder vcAC = new VariantContextBuilder(root).alleles(Arrays.asList(A, C));
|
||||
final VariantContextBuilder vcAG = new VariantContextBuilder(root).alleles(Arrays.asList(A, G));
|
||||
final VariantContextBuilder vcACG = new VariantContextBuilder(root).alleles(Arrays.asList(A, C, G));
|
||||
final VariantContextBuilder vcAGC = new VariantContextBuilder(root).alleles(Arrays.asList(A, G, C));
|
||||
|
||||
final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5);
|
||||
final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2);
|
||||
final Genotype gACcombined = makePL(0, 2, 5);
|
||||
final Genotype gAGcombined = makePL(0, 4, 9);
|
||||
final Genotype gACdropped = makePL(0, 1, 2);
|
||||
final Genotype gAGdropped = makePL(0, 3, 5);
|
||||
|
||||
// biallelic
|
||||
tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())});
|
||||
|
||||
// tri-allelic
|
||||
tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGdropped).make())});
|
||||
tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACdropped).make())});
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
|
||||
@Test(enabled = false, dataProvider = "TestMakeAlleleConditionalContexts")
|
||||
private void testMakeAlleleConditionalContexts(final VariantContext vc, final List<VariantContext> expectedVCs) {
|
||||
final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
|
||||
final List<VariantContext> biAllelicVCs = calc.makeAlleleConditionalContexts(vc);
|
||||
|
||||
Assert.assertEquals(biAllelicVCs.size(), expectedVCs.size());
|
||||
|
||||
for ( int i = 0; i < biAllelicVCs.size(); i++ ) {
|
||||
final VariantContext actual = biAllelicVCs.get(i);
|
||||
final VariantContext expected = expectedVCs.get(i);
|
||||
Assert.assertEquals(actual.getAlleles(), expected.getAlleles());
|
||||
|
||||
for ( int j = 0; j < actual.getNSamples(); j++ )
|
||||
Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@DataProvider(name = "ThetaNTests")
|
||||
public Object[][] makeThetaNTests() {
|
||||
List<Object[]> tests = new ArrayList<Object[]>();
|
||||
|
||||
final List<Double> log10LAlleles = Arrays.asList(0.0, -1.0, -2.0, -3.0, -4.0);
|
||||
|
||||
for ( final double log10pRef : Arrays.asList(-1, -2, -3) ) {
|
||||
for ( final int ploidy : Arrays.asList(1, 2, 3, 4) ) {
|
||||
for ( List<Double> permutations : Utils.makePermutations(log10LAlleles, ploidy, true)) {
|
||||
tests.add(new Object[]{permutations, Math.pow(10, log10pRef)});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tests.toArray(new Object[][]{});
|
||||
}
|
||||
|
||||
@Test(dataProvider = "ThetaNTests")
|
||||
public void testThetaNTests(final List<Double> log10LAlleles, final double pRef) {
|
||||
// biallelic
|
||||
final double[] rawPriors = MathUtils.toLog10(new double[]{pRef, 1-pRef});
|
||||
|
||||
final double log10pNonRef = Math.log10(1-pRef);
|
||||
|
||||
final List<AFCalcResult> originalPriors = new LinkedList<AFCalcResult>();
|
||||
final List<Double> pNonRefN = new LinkedList<Double>();
|
||||
for ( int i = 0; i < log10LAlleles.size(); i++ ) {
|
||||
final double log10LAllele1 = log10LAlleles.get(i);
|
||||
final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true);
|
||||
final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, 0.0));
|
||||
originalPriors.add(result1);
|
||||
pNonRefN.add(log10pNonRef*(i+1));
|
||||
}
|
||||
|
||||
final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 2);
|
||||
final List<AFCalcResult> thetaNPriors = calc.applyMultiAllelicPriors(originalPriors);
|
||||
|
||||
double prevPosterior = 0.0;
|
||||
for ( int i = 0; i < log10LAlleles.size(); i++ ) {
|
||||
final AFCalcResult thetaN = thetaNPriors.get(i);
|
||||
AFCalcResult orig = null;
|
||||
for ( final AFCalcResult x : originalPriors )
|
||||
if ( x.getAllelesUsedInGenotyping().equals(thetaN.getAllelesUsedInGenotyping()))
|
||||
orig = x;
|
||||
|
||||
Assert.assertNotNull(orig, "couldn't find original AFCalc");
|
||||
|
||||
Assert.assertEquals(orig.getLog10PriorOfAFGT0(), log10pNonRef, 1e-6);
|
||||
Assert.assertEquals(thetaN.getLog10PriorOfAFGT0(), pNonRefN.get(i), 1e-6);
|
||||
|
||||
Assert.assertTrue(orig.getLog10PosteriorOfAFGT0() <= prevPosterior, "AFCalc results should be sorted but " + prevPosterior + " is > original posterior " + orig.getLog10PosteriorOfAFGT0());
|
||||
prevPosterior = orig.getLog10PosteriorOfAFGT0();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -8,9 +8,10 @@ import java.util.Arrays;
|
|||
public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||
final static String REF = b37KGReference;
|
||||
final String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
|
||||
final String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam";
|
||||
final String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
|
||||
final String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam";
|
||||
final String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
|
||||
//final String RECAL_FILE = validationDataLocation + "NA12878.kmer.8.subset.recal_data.bqsr";
|
||||
|
||||
private void HCTest(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
|
||||
|
|
@ -20,28 +21,79 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
|||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "6b30c7e1b6bbe80d180d9d67441cec12");
|
||||
HCTest(CEUTRIO_BAM, "", "8c52c0955099cca3215a0d78fd455894");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "4cdfbfeadef00725974828310558d7d4");
|
||||
HCTest(NA12878_BAM, "", "01367428c26d3eaf9297c58bf8677dd3");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "6183fb6e374976d7087150009685e043");
|
||||
HCTest(CEUTRIO_BAM, "--max_alternate_alleles_for_indels 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "61c1a0fb62d909229af6b5a91dad8b35");
|
||||
}
|
||||
|
||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 2";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleComplex() {
|
||||
HCTestComplexVariants(CEUTRIO_BAM, "", "ab7593a7a60a2e9a66053572f1718df1");
|
||||
HCTestComplexVariants(CEUTRIO_BAM, "", "30598abeeb0b0ae5816ffdbf0c4044fd");
|
||||
}
|
||||
}
|
||||
|
||||
private void HCTestSymbolicVariants(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 2";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleSymbolic() {
|
||||
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "6eb9c1026225b38ba7bd3c4c218f8269");
|
||||
}
|
||||
|
||||
private void HCTestIndelQualityScores(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
|
||||
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "98d82d74e8d6a778290bee6c0df6d092");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestProblematicReadsModifiedInActiveRegions() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("fa5c5eb996e95aed12c50d70e6dd74d7"));
|
||||
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void HCTestStructuralIndels() {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c642dcd93771f6f084d55de31f180d1b"));
|
||||
executeTest("HCTestStructuralIndels: ", spec);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// testing reduced reads
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Test
|
||||
public void HCTestReducedBam() {
|
||||
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
|
||||
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
|
||||
Arrays.asList("79af83432dc4a1768b3ebffffc4d2b8f"));
|
||||
executeTest("HC calling on a ReducedRead BAM", spec);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -111,7 +111,13 @@ gsa.read.gatkreportv1 <- function(lines) {
|
|||
headerRowCount = -1;
|
||||
|
||||
finishTable <- function() {
|
||||
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows[1:rowCount,], tableEnv);
|
||||
if ( rowCount == 1 )
|
||||
# good I hate R. Work around to avoid collapsing into an unstructured vector when
|
||||
# there's only 1 row
|
||||
sub <- t(as.matrix(tableRows[1:rowCount,]))
|
||||
else
|
||||
sub <- tableRows[1:rowCount,]
|
||||
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, sub, tableEnv);
|
||||
}
|
||||
|
||||
for (line in lines) {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
library(gplots)
|
||||
library(ggplot2)
|
||||
library(tools)
|
||||
|
||||
# -------------------------------------------------------
|
||||
# Utilities for displaying multiple plots per page
|
||||
|
|
@ -59,6 +60,7 @@ closePDF <- function(outputPDF) {
|
|||
if ( ! is.na(outputPDF) ) {
|
||||
dev.off()
|
||||
if (exists("compactPDF")) {
|
||||
print("compacting PDF")
|
||||
compactPDF(outputPDF)
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -245,7 +245,7 @@ public class FastaSequenceIndexBuilder {
|
|||
* Reset iterators and add contig to sequence index
|
||||
*/
|
||||
private void finishReadingContig(FastaSequenceIndex sequenceIndex) {
|
||||
sequenceIndex.add(new FastaSequenceIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++));
|
||||
sequenceIndex.add(new FastaSequenceIndexEntry(trimContigName(contig), location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++));
|
||||
status = Status.NONE;
|
||||
contig = "";
|
||||
size = 0;
|
||||
|
|
@ -258,6 +258,14 @@ public class FastaSequenceIndexBuilder {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Trims the contig name to the expected value by removing any characters after the first whitespace
|
||||
*/
|
||||
private static String trimContigName(final String contigName) {
|
||||
int whitespaceIndex = contigName.indexOf(' ');
|
||||
return ( whitespaceIndex == -1 ) ? contigName : contigName.substring(0, whitespaceIndex);
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores FastaSequenceIndex as a .fasta.fai file on local machine
|
||||
* Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder
|
||||
|
|
|
|||
|
|
@ -125,6 +125,37 @@ public class GATKBAMFileSpan extends BAMFileSpan {
|
|||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a GATKChunk representing the "extent" of this file span, from the start of the first
|
||||
* chunk to the end of the last chunk.The chunks list must be sorted in order to use this method.
|
||||
*
|
||||
* @return a GATKChunk representing the extent of this file span, or a GATKChunk representing
|
||||
* a span of size 0 if there are no chunks
|
||||
*/
|
||||
public GATKChunk getExtent() {
|
||||
validateSorted(); // TODO: defensive measure: may be unnecessary
|
||||
|
||||
List<Chunk> chunks = getChunks();
|
||||
if ( chunks.isEmpty() ) {
|
||||
return new GATKChunk(0L, 0L);
|
||||
}
|
||||
|
||||
return new GATKChunk(chunks.get(0).getChunkStart(), chunks.get(chunks.size() - 1).getChunkEnd());
|
||||
}
|
||||
|
||||
/**
|
||||
* Validates the list of chunks to ensure that they appear in sorted order.
|
||||
*/
|
||||
private void validateSorted() {
|
||||
List<Chunk> chunks = getChunks();
|
||||
for ( int i = 1; i < chunks.size(); i++ ) {
|
||||
if ( chunks.get(i).getChunkStart() < chunks.get(i-1).getChunkEnd() ) {
|
||||
throw new ReviewedStingException(String.format("Chunk list is unsorted; chunk %s is before chunk %s", chunks.get(i-1), chunks.get(i)));
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the union of two FileSpans.
|
||||
* @param other FileSpan to union with this one.
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
|||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
|
@ -81,7 +81,7 @@ public class AlignmentValidation extends ReadWalker<Integer,Integer> {
|
|||
* @return Number of reads aligned by this map (aka 1).
|
||||
*/
|
||||
@Override
|
||||
public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
|
||||
//logger.info(String.format("examining read %s", read.getReadName()));
|
||||
|
||||
byte[] bases = read.getReadBases();
|
||||
|
|
|
|||
|
|
@ -1,139 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.alignment;
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFileFactory;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.WalkerName;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting alignments in SAM or BAM format.
|
||||
* Mimics the steps 'bwa aln' followed by 'bwa samse' using the BWA/C implementation.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
@WalkerName("Align")
|
||||
public class AlignmentWalker extends ReadWalker<Integer,Integer> {
|
||||
@Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " +
|
||||
"generated by bwa index -d bwtsw. If unspecified, will default " +
|
||||
"to the reference specified via the -R argument.",required=false)
|
||||
private File targetReferenceFile = null;
|
||||
|
||||
@Output
|
||||
private StingSAMFileWriter out = null;
|
||||
|
||||
/**
|
||||
* The actual aligner.
|
||||
*/
|
||||
private BWACAligner aligner = null;
|
||||
|
||||
/**
|
||||
* New header to use, if desired.
|
||||
*/
|
||||
private SAMFileHeader header;
|
||||
|
||||
/**
|
||||
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
|
||||
*/
|
||||
@Override
|
||||
public void initialize() {
|
||||
if(targetReferenceFile == null)
|
||||
targetReferenceFile = getToolkit().getArguments().referenceFile;
|
||||
BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
|
||||
BWAConfiguration configuration = new BWAConfiguration();
|
||||
aligner = new BWACAligner(bwtFiles,configuration);
|
||||
|
||||
// Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted.
|
||||
header = getToolkit().getSAMFileHeader().clone();
|
||||
SAMSequenceDictionary referenceDictionary =
|
||||
ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
|
||||
header.setSequenceDictionary(referenceDictionary);
|
||||
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
|
||||
|
||||
out.writeHeader(header);
|
||||
}
|
||||
|
||||
/**
|
||||
* Aligns a read to the given reference.
|
||||
*
|
||||
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
|
||||
* @param read Read to align.
|
||||
* @return Number of alignments found for this read.
|
||||
*/
|
||||
@Override
|
||||
public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
SAMRecord alignedRead = aligner.align(read,header);
|
||||
out.addAlignment(alignedRead);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initial value for reduce. In this case, alignments will be counted.
|
||||
* @return 0, indicating no alignments yet found.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
/**
|
||||
* Calculates the number of alignments found.
|
||||
* @param value Number of alignments found by this map.
|
||||
* @param sum Number of alignments found before this map.
|
||||
* @return Number of alignments found up to and including this map.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup.
|
||||
* @param result Number of reads processed.
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
aligner.close();
|
||||
super.onTraversalDone(result);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,132 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.alignment;
|
||||
|
||||
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
|
||||
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
|
||||
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.SortedMap;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/**
|
||||
* Counts the number of best alignments as presented by BWA and outputs a histogram of number of placements vs. the
|
||||
* frequency of that number of placements.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
|
||||
public class CountBestAlignments extends ReadWalker<Integer,Integer> {
|
||||
/**
|
||||
* The supporting BWT index generated using BWT.
|
||||
*/
|
||||
@Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
|
||||
private String prefix = null;
|
||||
|
||||
@Output
|
||||
private PrintStream out = null;
|
||||
|
||||
/**
|
||||
* The actual aligner.
|
||||
*/
|
||||
private Aligner aligner = null;
|
||||
|
||||
private SortedMap<Integer,Integer> alignmentFrequencies = new TreeMap<Integer,Integer>();
|
||||
|
||||
/**
|
||||
* Create an aligner object. The aligner object will load and hold the BWT until close() is called.
|
||||
*/
|
||||
@Override
|
||||
public void initialize() {
|
||||
if(prefix == null)
|
||||
prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();
|
||||
BWTFiles bwtFiles = new BWTFiles(prefix);
|
||||
BWAConfiguration configuration = new BWAConfiguration();
|
||||
aligner = new BWACAligner(bwtFiles,configuration);
|
||||
}
|
||||
|
||||
/**
|
||||
* Aligns a read to the given reference.
|
||||
*
|
||||
* @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
|
||||
* @param read Read to align.
|
||||
* @return Number of alignments found for this read.
|
||||
*/
|
||||
@Override
|
||||
public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
Iterator<Alignment[]> alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
|
||||
if(alignmentIterator.hasNext()) {
|
||||
int numAlignments = alignmentIterator.next().length;
|
||||
if(alignmentFrequencies.containsKey(numAlignments))
|
||||
alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1);
|
||||
else
|
||||
alignmentFrequencies.put(numAlignments,1);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initial value for reduce. In this case, validated reads will be counted.
|
||||
* @return 0, indicating no reads yet validated.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduceInit() { return 0; }
|
||||
|
||||
/**
|
||||
* Calculates the number of reads processed.
|
||||
* @param value Number of reads processed by this map.
|
||||
* @param sum Number of reads processed before this map.
|
||||
* @return Number of reads processed up to and including this map.
|
||||
*/
|
||||
@Override
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return value + sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cleanup.
|
||||
* @param result Number of reads processed.
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
aligner.close();
|
||||
for(Map.Entry<Integer,Integer> alignmentFrequency: alignmentFrequencies.entrySet())
|
||||
out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue());
|
||||
super.onTraversalDone(result);
|
||||
}
|
||||
}
|
||||
|
|
@ -46,7 +46,7 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
|
|||
/**
|
||||
* Maps indices of command line arguments to values paired with that argument.
|
||||
*/
|
||||
public final SortedMap<ArgumentMatchSite,List<String>> sites = new TreeMap<ArgumentMatchSite,List<String>>();
|
||||
public final SortedMap<ArgumentMatchSite,List<ArgumentMatchValue>> sites = new TreeMap<ArgumentMatchSite,List<ArgumentMatchValue>>();
|
||||
|
||||
/**
|
||||
* An ordered, freeform collection of tags.
|
||||
|
|
@ -90,11 +90,11 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
|
|||
* @param value Value for the argument at this position.
|
||||
* @param tags ordered freeform text tags associated with this argument.
|
||||
*/
|
||||
private ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final String value, final Tags tags) {
|
||||
private ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final ArgumentMatchValue value, final Tags tags) {
|
||||
this.label = label;
|
||||
this.definition = definition;
|
||||
|
||||
ArrayList<String> values = new ArrayList<String>();
|
||||
ArrayList<ArgumentMatchValue> values = new ArrayList<ArgumentMatchValue>();
|
||||
if( value != null )
|
||||
values.add(value);
|
||||
sites.put(site,values );
|
||||
|
|
@ -131,11 +131,11 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
|
|||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
ArgumentMatch transform(Multiplexer multiplexer, Object key) {
|
||||
SortedMap<ArgumentMatchSite,List<String>> newIndices = new TreeMap<ArgumentMatchSite,List<String>>();
|
||||
for(Map.Entry<ArgumentMatchSite,List<String>> site: sites.entrySet()) {
|
||||
List<String> newEntries = new ArrayList<String>();
|
||||
for(String entry: site.getValue())
|
||||
newEntries.add(multiplexer.transformArgument(key,entry));
|
||||
SortedMap<ArgumentMatchSite,List<ArgumentMatchValue>> newIndices = new TreeMap<ArgumentMatchSite,List<ArgumentMatchValue>>();
|
||||
for(Map.Entry<ArgumentMatchSite,List<ArgumentMatchValue>> site: sites.entrySet()) {
|
||||
List<ArgumentMatchValue> newEntries = new ArrayList<ArgumentMatchValue>();
|
||||
for(ArgumentMatchValue entry: site.getValue())
|
||||
newEntries.add(new ArgumentMatchStringValue(multiplexer.transformArgument(key,entry.asString())));
|
||||
newIndices.put(site.getKey(),newEntries);
|
||||
}
|
||||
ArgumentMatch newArgumentMatch = new ArgumentMatch(label,definition);
|
||||
|
|
@ -165,7 +165,7 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
|
|||
/**
|
||||
* Iterate over each available token.
|
||||
*/
|
||||
private Iterator<String> tokenIterator = null;
|
||||
private Iterator<ArgumentMatchValue> tokenIterator = null;
|
||||
|
||||
/**
|
||||
* The next site to return. Null if none remain.
|
||||
|
|
@ -175,7 +175,7 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
|
|||
/**
|
||||
* The next token to return. Null if none remain.
|
||||
*/
|
||||
String nextToken = null;
|
||||
ArgumentMatchValue nextToken = null;
|
||||
|
||||
{
|
||||
siteIterator = sites.keySet().iterator();
|
||||
|
|
@ -254,9 +254,9 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
|
|||
* @param site site of the command-line argument to which this value is mated.
|
||||
* @param value Text representation of value to add.
|
||||
*/
|
||||
public void addValue( ArgumentMatchSite site, String value ) {
|
||||
public void addValue( ArgumentMatchSite site, ArgumentMatchValue value ) {
|
||||
if( !sites.containsKey(site) || sites.get(site) == null )
|
||||
sites.put(site, new ArrayList<String>() );
|
||||
sites.put(site, new ArrayList<ArgumentMatchValue>() );
|
||||
sites.get(site).add(value);
|
||||
}
|
||||
|
||||
|
|
@ -275,8 +275,8 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
|
|||
* Return the values associated with this argument match.
|
||||
* @return A collection of the string representation of these value.
|
||||
*/
|
||||
public List<String> values() {
|
||||
List<String> values = new ArrayList<String>();
|
||||
public List<ArgumentMatchValue> values() {
|
||||
List<ArgumentMatchValue> values = new ArrayList<ArgumentMatchValue>();
|
||||
for( ArgumentMatchSite site: sites.keySet() ) {
|
||||
if( sites.get(site) != null )
|
||||
values.addAll(sites.get(site));
|
||||
|
|
|
|||
|
|
@ -0,0 +1,27 @@
|
|||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Holds a reference to a file as an argument match value.
|
||||
*
|
||||
* This is useful when the type of the stored file may be a subclass of java.io.File,
|
||||
* for example a Queue RemoteFile.
|
||||
*/
|
||||
public class ArgumentMatchFileValue extends ArgumentMatchValue {
|
||||
private final File file;
|
||||
|
||||
public ArgumentMatchFileValue(File file) {
|
||||
this.file = file;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String asString() {
|
||||
return file == null ? null : file.getAbsolutePath();
|
||||
}
|
||||
|
||||
@Override
|
||||
public File asFile() {
|
||||
return file;
|
||||
}
|
||||
}
|
||||
|
|
@ -24,38 +24,36 @@
|
|||
|
||||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Where an argument match originated, via the commandline or a file.
|
||||
* Where an argument match originated, via the commandline or a custom provider.
|
||||
*/
|
||||
public class ArgumentMatchSource implements Comparable<ArgumentMatchSource> {
|
||||
public static final ArgumentMatchSource COMMAND_LINE = new ArgumentMatchSource(ArgumentMatchSourceType.CommandLine, null);
|
||||
|
||||
private final ArgumentMatchSourceType type;
|
||||
private final File file;
|
||||
private final String description;
|
||||
|
||||
/**
|
||||
* Creates an argument match source from the specified file.
|
||||
* @param file File specifying the arguments. Must not be null.
|
||||
* @param description Where the arguments originated.
|
||||
*/
|
||||
public ArgumentMatchSource(File file) {
|
||||
this(ArgumentMatchSourceType.File, file);
|
||||
public ArgumentMatchSource(String description) {
|
||||
this(ArgumentMatchSourceType.Provider, description);
|
||||
}
|
||||
|
||||
private ArgumentMatchSource(ArgumentMatchSourceType type, File file) {
|
||||
if (type == ArgumentMatchSourceType.File && file == null)
|
||||
throw new IllegalArgumentException("An argument match source of type File cannot have a null file.");
|
||||
private ArgumentMatchSource(ArgumentMatchSourceType type, String description) {
|
||||
if (type == ArgumentMatchSourceType.Provider && description == null)
|
||||
throw new IllegalArgumentException("An argument match source provider cannot have a null description.");
|
||||
this.type = type;
|
||||
this.file = file;
|
||||
this.description = description;
|
||||
}
|
||||
|
||||
public ArgumentMatchSourceType getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public File getFile() {
|
||||
return file;
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
@ -65,13 +63,13 @@ public class ArgumentMatchSource implements Comparable<ArgumentMatchSource> {
|
|||
|
||||
ArgumentMatchSource that = (ArgumentMatchSource) o;
|
||||
|
||||
return (type == that.type) && (file == null ? that.file == null : file.equals(that.file));
|
||||
return (type == that.type) && (description == null ? that.description == null : description.equals(that.description));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = type != null ? type.hashCode() : 0;
|
||||
result = 31 * result + (file != null ? file.hashCode() : 0);
|
||||
result = 31 * result + (description != null ? description.hashCode() : 0);
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@ -84,15 +82,15 @@ public class ArgumentMatchSource implements Comparable<ArgumentMatchSource> {
|
|||
if (comp != 0)
|
||||
return comp;
|
||||
|
||||
File f1 = this.file;
|
||||
File f2 = that.file;
|
||||
String d1 = this.description;
|
||||
String d2 = that.description;
|
||||
|
||||
if ((f1 == null) ^ (f2 == null)) {
|
||||
// If one of the files is null and the other is not
|
||||
// put the null file first
|
||||
return f1 == null ? -1 : 1;
|
||||
if ((d1 == null) ^ (d2 == null)) {
|
||||
// If one of the descriptions is null and the other is not
|
||||
// put the null description first
|
||||
return d1 == null ? -1 : 1;
|
||||
}
|
||||
|
||||
return f1 == null ? 0 : f1.compareTo(f2);
|
||||
return d1 == null ? 0 : d1.compareTo(d2);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,8 +25,8 @@
|
|||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
/**
|
||||
* Type of where an argument match originated, via the commandline or a file.
|
||||
* Type of where an argument match originated, via the commandline or a some other provider.
|
||||
*/
|
||||
public enum ArgumentMatchSourceType {
|
||||
CommandLine, File
|
||||
CommandLine, Provider
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,24 @@
|
|||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Argument values that originated from a string.
|
||||
*/
|
||||
public class ArgumentMatchStringValue extends ArgumentMatchValue {
|
||||
private final String value;
|
||||
|
||||
public ArgumentMatchStringValue(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String asString() {
|
||||
return value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public File asFile() {
|
||||
return value == null ? null : new File(value);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,18 @@
|
|||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
 * A single argument value that can be read either as a string or as a file.
 */
public abstract class ArgumentMatchValue {
    /**
     * @return the value of this argument as a String object.
     */
    public abstract String asString();

    /**
     * @return the value of this argument as a File object.
     */
    public abstract File asFile();
}
|
||||
|
|
@ -215,8 +215,8 @@ public abstract class ArgumentTypeDescriptor {
|
|||
* @param matches The matches for the given argument.
|
||||
* @return The value of the argument if available, or null if not present.
|
||||
*/
|
||||
protected String getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) {
|
||||
Collection<String> argumentValues = getArgumentValues( definition, matches );
|
||||
protected ArgumentMatchValue getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) {
|
||||
Collection<ArgumentMatchValue> argumentValues = getArgumentValues( definition, matches );
|
||||
if( argumentValues.size() > 1 )
|
||||
throw new UserException.CommandLineException("Multiple values associated with given definition, but this argument expects only one: " + definition.fullName);
|
||||
return argumentValues.size() > 0 ? argumentValues.iterator().next() : null;
|
||||
|
|
@ -244,8 +244,8 @@ public abstract class ArgumentTypeDescriptor {
|
|||
* @param matches The matches for the given argument.
|
||||
* @return The value of the argument if available, or an empty collection if not present.
|
||||
*/
|
||||
protected Collection<String> getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) {
|
||||
Collection<String> values = new ArrayList<String>();
|
||||
protected Collection<ArgumentMatchValue> getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) {
|
||||
Collection<ArgumentMatchValue> values = new ArrayList<ArgumentMatchValue>();
|
||||
for( ArgumentMatch match: matches ) {
|
||||
if( match.definition.equals(definition) )
|
||||
values.addAll(match.values());
|
||||
|
|
@ -310,7 +310,7 @@ public abstract class ArgumentTypeDescriptor {
|
|||
*/
|
||||
protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) {
|
||||
ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
|
||||
String value = getArgumentValue(defaultDefinition, matches);
|
||||
ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches);
|
||||
@SuppressWarnings("unchecked")
|
||||
Class<? extends Feature> parameterType = JVMUtils.getParameterizedTypeClass(type);
|
||||
String name = defaultDefinition.fullName;
|
||||
|
|
@ -328,7 +328,7 @@ public abstract class ArgumentTypeDescriptor {
|
|||
* @param fieldName The name of the field that was parsed. Used for error reporting.
|
||||
* @return The newly created binding object of type bindingClass.
|
||||
*/
|
||||
public static Object parseBinding(String value, Class<? extends Feature> parameterType, Type bindingClass,
|
||||
public static Object parseBinding(ArgumentMatchValue value, Class<? extends Feature> parameterType, Type bindingClass,
|
||||
String bindingName, Tags tags, String fieldName) {
|
||||
try {
|
||||
String tribbleType = null;
|
||||
|
|
@ -337,7 +337,7 @@ public abstract class ArgumentTypeDescriptor {
|
|||
throw new UserException.CommandLineException(
|
||||
String.format("Unexpected number of positional tags for argument %s : %s. " +
|
||||
"Rod bindings only support -X:type and -X:name,type argument styles",
|
||||
value, fieldName));
|
||||
value.asString(), fieldName));
|
||||
} else if ( tags.getPositionalTags().size() == 2 ) {
|
||||
// -X:name,type style
|
||||
bindingName = tags.getPositionalTags().get(0);
|
||||
|
|
@ -366,7 +366,7 @@ public abstract class ArgumentTypeDescriptor {
|
|||
|
||||
if ( tribbleType == null ) {
|
||||
// try to determine the file type dynamically
|
||||
File file = new File(value);
|
||||
File file = value.asFile();
|
||||
if ( file.canRead() && file.isFile() ) {
|
||||
FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file);
|
||||
if ( featureDescriptor != null ) {
|
||||
|
|
@ -379,7 +379,7 @@ public abstract class ArgumentTypeDescriptor {
|
|||
// IntervalBinding can be created from a normal String
|
||||
Class rawType = (makeRawTypeIfNecessary(bindingClass));
|
||||
try {
|
||||
return rawType.getConstructor(String.class).newInstance(value);
|
||||
return rawType.getConstructor(String.class).newInstance(value.asString());
|
||||
} catch (NoSuchMethodException e) {
|
||||
/* ignore */
|
||||
}
|
||||
|
|
@ -399,14 +399,14 @@ public abstract class ArgumentTypeDescriptor {
|
|||
}
|
||||
|
||||
Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class);
|
||||
return ctor.newInstance(parameterType, bindingName, value, tribbleType, tags);
|
||||
return ctor.newInstance(parameterType, bindingName, value.asString(), tribbleType, tags);
|
||||
} catch (Exception e) {
|
||||
if ( e instanceof UserException )
|
||||
throw ((UserException)e);
|
||||
else
|
||||
throw new UserException.CommandLineException(
|
||||
String.format("Failed to parse value %s for argument %s. Message: %s",
|
||||
value, fieldName, e.getMessage()));
|
||||
value.asString(), fieldName, e.getMessage()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -517,7 +517,7 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
return true;
|
||||
|
||||
ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
|
||||
String value = getArgumentValue( defaultDefinition, matches );
|
||||
ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches);
|
||||
Object result;
|
||||
Tags tags = getArgumentTags(matches);
|
||||
|
||||
|
|
@ -527,12 +527,12 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
Method valueOf = primitiveToWrapperMap.get(type).getMethod("valueOf",String.class);
|
||||
if(value == null)
|
||||
throw new MissingArgumentValueException(createDefaultArgumentDefinition(source));
|
||||
result = valueOf.invoke(null,value.trim());
|
||||
result = valueOf.invoke(null,value.asString().trim());
|
||||
} else if (type.isEnum()) {
|
||||
Object[] vals = type.getEnumConstants();
|
||||
Object defaultEnumeration = null; // as we look at options, record the default option if it exists
|
||||
for (Object val : vals) {
|
||||
if (String.valueOf(val).equalsIgnoreCase(value)) return val;
|
||||
if (String.valueOf(val).equalsIgnoreCase(value.asString())) return val;
|
||||
try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; }
|
||||
catch (NoSuchFieldException e) { throw new ReviewedStingException("parsing " + type.toString() + "doesn't contain the field " + val.toString()); }
|
||||
}
|
||||
|
|
@ -544,10 +544,12 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
else if (value == null)
|
||||
throw new MissingArgumentValueException(createDefaultArgumentDefinition(source));
|
||||
else
|
||||
throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value);
|
||||
throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString());
|
||||
} else if (type.equals(File.class)) {
|
||||
result = value.asFile();
|
||||
} else {
|
||||
Constructor ctor = type.getConstructor(String.class);
|
||||
result = ctor.newInstance(value);
|
||||
result = ctor.newInstance(value.asString());
|
||||
}
|
||||
} catch (UserException e) {
|
||||
throw e;
|
||||
|
|
|
|||
|
|
@ -174,7 +174,7 @@ public abstract class CommandLineProgram {
|
|||
ParsingEngine parser = clp.parser = new ParsingEngine(clp);
|
||||
parser.addArgumentSource(clp.getClass());
|
||||
|
||||
Map<ArgumentMatchSource, List<String>> parsedArgs;
|
||||
Map<ArgumentMatchSource, ParsedArgs> parsedArgs;
|
||||
|
||||
// process the args
|
||||
if (clp.canAddArgumentsDynamically()) {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,13 @@
|
|||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
/**
 * Represents the collection of arguments parsed from one argument source.
 *
 * Useful for printing out help documents.
 */
public abstract class ParsedArgs {
    /**
     * @return A compact description of the arguments from a provider/source.
     */
    public abstract String getDescription();
}
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A list of string arguments, usually from the command line or an args list file.
|
||||
*/
|
||||
public class ParsedListArgs extends ParsedArgs {
|
||||
private final List<String> args = new ArrayList<String>();
|
||||
|
||||
public ParsedListArgs() {
|
||||
}
|
||||
|
||||
public ParsedListArgs(List<String> args) {
|
||||
this.args.addAll(args);
|
||||
}
|
||||
|
||||
public void add(String... args) {
|
||||
this.args.addAll(Arrays.asList(args));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getDescription() {
|
||||
return StringUtils.join(this.args, " ");
|
||||
}
|
||||
}
|
||||
|
|
@ -30,6 +30,7 @@ import org.apache.commons.io.FileUtils;
|
|||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.classloader.JVMUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
|
@ -61,7 +62,7 @@ public class ParsingEngine {
|
|||
* Indicates as best as possible where command-line text remains unmatched
|
||||
* to existing arguments.
|
||||
*/
|
||||
ArgumentMatches argumentMatches = null;
|
||||
private ArgumentMatches argumentMatches = null;
|
||||
|
||||
/**
|
||||
* Techniques for parsing and for argument lookup.
|
||||
|
|
@ -88,7 +89,10 @@ public class ParsingEngine {
|
|||
/**
|
||||
* List of tags associated with the given instantiation of the command-line argument.
|
||||
*/
|
||||
private final Map<Object,Tags> tags = new IdentityHashMap<Object,Tags>();
|
||||
private final Map<Object,Tags> tags = new IdentityHashMap<Object,Tags>();
|
||||
|
||||
private PluginManager<ParsingEngineArgumentProvider> argumentProviderPluginManager =
|
||||
new PluginManager<ParsingEngineArgumentProvider>(ParsingEngineArgumentProvider.class);
|
||||
|
||||
/**
|
||||
* our log, which we want to capture anything from org.broadinstitute.sting
|
||||
|
|
@ -105,7 +109,10 @@ public class ParsingEngine {
|
|||
argumentTypeDescriptors.addAll(clp.getArgumentTypeDescriptors());
|
||||
argumentTypeDescriptors.addAll(STANDARD_ARGUMENT_TYPE_DESCRIPTORS);
|
||||
|
||||
addArgumentSource(ParsingEngineArgumentFiles.class);
|
||||
List<Class<? extends ParsingEngineArgumentProvider>> providers = argumentProviderPluginManager.getPlugins();
|
||||
for (Class<? extends ParsingEngineArgumentProvider> provider: providers) {
|
||||
addArgumentSource(provider);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -117,6 +124,10 @@ public class ParsingEngine {
|
|||
addArgumentSource(null, source);
|
||||
}
|
||||
|
||||
public ArgumentMatches getArgumentMatches() {
|
||||
return argumentMatches;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add an argument source. Argument sources are expected to have
|
||||
* any number of fields with an @Argument annotation attached.
|
||||
|
|
@ -156,29 +167,30 @@ public class ParsingEngine {
|
|||
* @param tokens Tokens passed on the command line.
|
||||
* @return The parsed arguments by file.
|
||||
*/
|
||||
public SortedMap<ArgumentMatchSource, List<String>> parse( String[] tokens ) {
|
||||
public SortedMap<ArgumentMatchSource, ParsedArgs> parse( String[] tokens ) {
|
||||
argumentMatches = new ArgumentMatches();
|
||||
SortedMap<ArgumentMatchSource, List<String>> parsedArgs = new TreeMap<ArgumentMatchSource, List<String>>();
|
||||
SortedMap<ArgumentMatchSource, ParsedArgs> parsedArgs = new TreeMap<ArgumentMatchSource, ParsedArgs>();
|
||||
|
||||
List<String> cmdLineTokens = Arrays.asList(tokens);
|
||||
parse(ArgumentMatchSource.COMMAND_LINE, cmdLineTokens, argumentMatches, parsedArgs);
|
||||
|
||||
ParsingEngineArgumentFiles argumentFiles = new ParsingEngineArgumentFiles();
|
||||
List<ParsingEngineArgumentProvider> providers = argumentProviderPluginManager.createAllTypes();
|
||||
|
||||
// Load the arguments ONLY into the argument files.
|
||||
// Validation may optionally run on the rest of the arguments.
|
||||
loadArgumentsIntoObject(argumentFiles);
|
||||
for (ParsingEngineArgumentProvider provider: providers) {
|
||||
// Load the arguments ONLY into the provider.
|
||||
// Validation may optionally run on the rest of the arguments.
|
||||
loadArgumentsIntoObject(provider);
|
||||
}
|
||||
|
||||
for (File file: argumentFiles.files) {
|
||||
List<String> fileTokens = getArguments(file);
|
||||
parse(new ArgumentMatchSource(file), fileTokens, argumentMatches, parsedArgs);
|
||||
for (ParsingEngineArgumentProvider provider: providers) {
|
||||
provider.parse(this, parsedArgs);
|
||||
}
|
||||
|
||||
return parsedArgs;
|
||||
}
|
||||
|
||||
private void parse(ArgumentMatchSource matchSource, List<String> tokens,
|
||||
ArgumentMatches argumentMatches, SortedMap<ArgumentMatchSource, List<String>> parsedArgs) {
|
||||
public void parse(ArgumentMatchSource matchSource, List<String> tokens,
|
||||
ArgumentMatches argumentMatches, SortedMap<ArgumentMatchSource, ParsedArgs> parsedArgs) {
|
||||
ArgumentMatchSite lastArgumentMatchSite = new ArgumentMatchSite(matchSource, -1);
|
||||
|
||||
int i = 0;
|
||||
|
|
@ -195,19 +207,44 @@ public class ParsingEngine {
|
|||
}
|
||||
else {
|
||||
if( argumentMatches.hasMatch(lastArgumentMatchSite) &&
|
||||
!argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite))
|
||||
argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, token );
|
||||
!argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite))
|
||||
argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, new ArgumentMatchStringValue(token) );
|
||||
else
|
||||
argumentMatches.MissingArgument.addValue( site, token );
|
||||
argumentMatches.MissingArgument.addValue( site, new ArgumentMatchStringValue(token) );
|
||||
|
||||
}
|
||||
i++;
|
||||
}
|
||||
|
||||
parsedArgs.put(matchSource, tokens);
|
||||
parsedArgs.put(matchSource, new ParsedListArgs(tokens));
|
||||
}
|
||||
|
||||
private List<String> getArguments(File file) {
|
||||
public void parsePairs(ArgumentMatchSource matchSource, List<Pair<String, ArgumentMatchValue>> tokens,
|
||||
ArgumentMatches argumentMatches, ParsedArgs matchSourceArgs,
|
||||
SortedMap<ArgumentMatchSource, ParsedArgs> parsedArgs) {
|
||||
int i = 0;
|
||||
for (Pair<String, ArgumentMatchValue> pair: tokens) {
|
||||
|
||||
ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i);
|
||||
List<DefinitionMatcher> matchers = Arrays.asList(ArgumentDefinitions.FullNameDefinitionMatcher, ArgumentDefinitions.ShortNameDefinitionMatcher);
|
||||
ArgumentDefinition definition = null;
|
||||
for (DefinitionMatcher matcher: matchers) {
|
||||
definition = argumentDefinitions.findArgumentDefinition( pair.getFirst(), matcher );
|
||||
if (definition != null)
|
||||
break;
|
||||
}
|
||||
if (definition == null)
|
||||
continue;
|
||||
ArgumentMatch argumentMatch = new ArgumentMatch(pair.getFirst(), definition, site, new Tags());
|
||||
argumentMatches.mergeInto(argumentMatch);
|
||||
argumentMatch.addValue(site, pair.getSecond());
|
||||
i++;
|
||||
}
|
||||
|
||||
parsedArgs.put(matchSource, matchSourceArgs);
|
||||
}
|
||||
|
||||
protected List<String> getArguments(File file) {
|
||||
try {
|
||||
if (file.getAbsolutePath().endsWith(".list")) {
|
||||
return getListArguments(file);
|
||||
|
|
@ -283,9 +320,9 @@ public class ParsingEngine {
|
|||
|
||||
// Ensure that the field contents meet the validation criteria specified by the regular expression.
|
||||
for( ArgumentMatch verifiableMatch: verifiableMatches ) {
|
||||
for( String value: verifiableMatch.values() ) {
|
||||
if( verifiableArgument.validation != null && !value.matches(verifiableArgument.validation) )
|
||||
invalidValues.add( new Pair<ArgumentDefinition,String>(verifiableArgument, value) );
|
||||
for( ArgumentMatchValue value: verifiableMatch.values() ) {
|
||||
if( verifiableArgument.validation != null && !value.asString().matches(verifiableArgument.validation) )
|
||||
invalidValues.add( new Pair<ArgumentDefinition,String>(verifiableArgument, value.asString()) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -629,21 +666,21 @@ class UnmatchedArgumentException extends ArgumentException {
|
|||
private static String formatArguments( ArgumentMatch invalidValues ) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for( ArgumentMatchSite site: invalidValues.sites.keySet() )
|
||||
for( String value: invalidValues.sites.get(site) ) {
|
||||
for( ArgumentMatchValue value: invalidValues.sites.get(site) ) {
|
||||
switch (site.getSource().getType()) {
|
||||
case CommandLine:
|
||||
sb.append( String.format("%nInvalid argument value '%s' at position %d.",
|
||||
value, site.getIndex()) );
|
||||
value.asString(), site.getIndex()) );
|
||||
break;
|
||||
case File:
|
||||
sb.append( String.format("%nInvalid argument value '%s' in file %s at position %d.",
|
||||
value, site.getSource().getFile().getAbsolutePath(), site.getIndex()) );
|
||||
case Provider:
|
||||
sb.append( String.format("%nInvalid argument value '%s' in %s at position %d.",
|
||||
value.asString(), site.getSource().getDescription(), site.getIndex()) );
|
||||
break;
|
||||
default:
|
||||
throw new RuntimeException( String.format("Unexpected argument match source type: %s",
|
||||
site.getSource().getType()));
|
||||
}
|
||||
if(value != null && Utils.dupString(' ',value.length()).equals(value))
|
||||
if(value.asString() != null && Utils.dupString(' ',value.asString().length()).equals(value.asString()))
|
||||
sb.append(" Please make sure any line continuation backslashes on your command line are not followed by whitespace.");
|
||||
}
|
||||
return sb.toString();
|
||||
|
|
@ -696,12 +733,3 @@ class UnknownEnumeratedValueException extends ArgumentException {
|
|||
return String.format("Invalid value %s specified for argument %s; valid options are (%s).", argumentPassed, definition.fullName, Utils.join(",",definition.validOptions));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Container class to store the list of argument files.
|
||||
* The files will be parsed after the command line arguments.
|
||||
*/
|
||||
class ParsingEngineArgumentFiles {
|
||||
@Argument(fullName = "arg_file", shortName = "args", doc = "Reads arguments from the specified file", required = false)
|
||||
public List<File> files = new ArrayList<File>();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,30 @@
|
|||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.SortedMap;
|
||||
|
||||
/**
|
||||
* Container class to store the list of argument files.
|
||||
* The files will be parsed after the command line arguments.
|
||||
*/
|
||||
public class ParsingEngineArgumentFiles extends ParsingEngineArgumentProvider {
|
||||
@Argument(fullName = "arg_file", shortName = "args", doc = "Reads arguments from the specified file", required = false)
|
||||
public List<File> files = new ArrayList<File>();
|
||||
|
||||
@Override
|
||||
public void parse(ParsingEngine parsingEngine, SortedMap<ArgumentMatchSource, ParsedArgs> parsedArgs) {
|
||||
ArgumentMatches argumentMatches = parsingEngine.getArgumentMatches();
|
||||
for (File file: this.files) {
|
||||
List<String> fileTokens = parsingEngine.getArguments(file);
|
||||
parsingEngine.parse(new ArgumentMatchFileSource(file), fileTokens, argumentMatches, parsedArgs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class ArgumentMatchFileSource extends ArgumentMatchSource {
|
||||
ArgumentMatchFileSource(File file) {
|
||||
super("file " + file.getAbsolutePath());
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,12 @@
|
|||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.SortedMap;
|
||||
|
||||
/**
|
||||
* A class that can parse arguments for the engine
|
||||
*/
|
||||
public abstract class ParsingEngineArgumentProvider {
|
||||
public abstract void parse(ParsingEngine parsingEngine, SortedMap<ArgumentMatchSource, ParsedArgs> parsedArgs);
|
||||
}
|
||||
|
||||
|
|
@ -117,6 +117,15 @@ public final class RodBinding<T extends Feature> {
|
|||
this.bound = true;
|
||||
}
|
||||
|
||||
/**
 * For testing purposes only.  Creates a RodBinding sufficient for looking up associations to rawName.
 *
 * Delegates to the five-argument constructor, passing "missing" as the third argument,
 * the simple name of type as the fourth, and a newly constructed Tags object as the fifth.
 *
 * @param type    class token for T
 * @param rawName the name associations are looked up by
 */
public RodBinding(Class<T> type, final String rawName) {
    this(type, rawName, "missing", type.getSimpleName(), new Tags());
}
|
||||
|
||||
/**
|
||||
* Make an unbound RodBinding<T>. Only available for creating the globally unique UNBOUND object
|
||||
* @param type class this unbound RodBinding creates
|
||||
|
|
|
|||
|
|
@ -112,31 +112,38 @@ public class CommandLineGATK extends CommandLineExecutable {
|
|||
}
|
||||
}
|
||||
|
||||
protected static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
|
||||
protected static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
|
||||
public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
|
||||
public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
|
||||
public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device";
|
||||
public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded";
|
||||
|
||||
private static void checkForMaskedUserErrors(final Throwable t) {
|
||||
final String message = t.getMessage();
|
||||
if ( message == null )
|
||||
return;
|
||||
|
||||
// we know what to do about the common "Too many open files" error
|
||||
if ( message.indexOf("Too many open files") != -1 )
|
||||
if ( message.contains("Too many open files") )
|
||||
exitSystemWithUserError(new UserException.TooManyOpenFiles());
|
||||
|
||||
// malformed BAM looks like a SAM file
|
||||
if ( message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_1) != -1 ||
|
||||
message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_2) != -1 )
|
||||
if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) ||
|
||||
message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) )
|
||||
exitSystemWithSamError(t);
|
||||
|
||||
// can't close tribble index when writing
|
||||
if ( message.indexOf("Unable to close index for") != -1 )
|
||||
if ( message.contains("Unable to close index for") )
|
||||
exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage()));
|
||||
|
||||
// disk is full
|
||||
if ( message.indexOf("No space left on device") != -1 )
|
||||
exitSystemWithUserError(new UserException(t.getMessage()));
|
||||
if ( t.getCause() != null && t.getCause().getMessage().indexOf("No space left on device") != -1 )
|
||||
exitSystemWithUserError(new UserException(t.getCause().getMessage()));
|
||||
if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) )
|
||||
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
|
||||
if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) )
|
||||
exitSystemWithUserError(new UserException.NoSpaceOnDevice());
|
||||
|
||||
// masked out of memory error
|
||||
if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError )
|
||||
exitSystemWithUserError(new UserException.NotEnoughMemory());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -1,52 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
/**
|
||||
* Describes the method for downsampling reads at a given locus.
|
||||
*
|
||||
* @author hanna
|
||||
* @version 0.1
|
||||
*/
|
||||
|
||||
public class DownsamplingMethod {
|
||||
/**
|
||||
* Type of downsampling to perform.
|
||||
*/
|
||||
public final DownsampleType type;
|
||||
|
||||
/**
|
||||
* Actual downsampling target is specified as an integer number of reads.
|
||||
*/
|
||||
public final Integer toCoverage;
|
||||
|
||||
/**
|
||||
* Actual downsampling target is specified as a fraction of total available reads.
|
||||
*/
|
||||
public final Double toFraction;
|
||||
|
||||
/**
|
||||
* Expresses no downsampling applied at all.
|
||||
*/
|
||||
public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null);
|
||||
|
||||
public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) {
|
||||
// Do some basic sanity checks on the downsampling parameters passed in.
|
||||
|
||||
// Can't leave toFraction and toCoverage null unless type is experimental naive duplicate eliminator.
|
||||
if(type != DownsampleType.NONE && toFraction == null && toCoverage == null)
|
||||
throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");
|
||||
|
||||
// Fraction and coverage cannot both be specified.
|
||||
if(toFraction != null && toCoverage != null)
|
||||
throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");
|
||||
|
||||
// Experimental by sample downsampling does not work with a fraction of reads.
|
||||
if(type == DownsampleType.BY_SAMPLE && toFraction != null)
|
||||
throw new UserException.CommandLineException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method");
|
||||
|
||||
this.type = type;
|
||||
this.toCoverage = toCoverage;
|
||||
this.toFraction = toFraction;
|
||||
}
|
||||
}
|
||||
|
|
@ -24,25 +24,28 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.*;
|
||||
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.executive.MicroScheduler;
|
||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.Stub;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
||||
|
|
@ -50,20 +53,16 @@ import org.broadinstitute.sting.gatk.samples.SampleDB;
|
|||
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -136,11 +135,18 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
private Collection<ReadFilter> filters;
|
||||
|
||||
/**
|
||||
* Collection of the read transformers applied to the reads
|
||||
*/
|
||||
private List<ReadTransformer> readTransformers;
|
||||
|
||||
/**
|
||||
* Controls the allocation of threads between CPU vs IO.
|
||||
*/
|
||||
private ThreadAllocation threadAllocation;
|
||||
|
||||
private ReadMetrics cumulativeMetrics = null;
|
||||
|
||||
/**
|
||||
* A currently hacky unique name for this GATK instance
|
||||
*/
|
||||
|
|
@ -175,6 +181,13 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
private Collection<RMDTriplet> referenceMetaDataFiles;
|
||||
|
||||
/**
|
||||
* The threading efficiency monitor we use in the GATK to monitor our efficiency.
|
||||
*
|
||||
* May be null if one isn't active, or hasn't been initialized yet
|
||||
*/
|
||||
private ThreadEfficiencyMonitor threadEfficiencyMonitor = null;
|
||||
|
||||
/**
|
||||
* Set the reference metadata files to use for this traversal.
|
||||
* @param referenceMetaDataFiles Collection of files and descriptors over which to traverse.
|
||||
|
|
@ -252,6 +265,7 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
// our microscheduler, which is in charge of running everything
|
||||
MicroScheduler microScheduler = createMicroscheduler();
|
||||
threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor();
|
||||
|
||||
// create temp directories as necessary
|
||||
initializeTempDirectory();
|
||||
|
|
@ -280,6 +294,8 @@ public class GenomeAnalysisEngine {
|
|||
static {
|
||||
deprecatedGATKWalkers.put("CountCovariates", "2.0");
|
||||
deprecatedGATKWalkers.put("TableRecalibration", "2.0");
|
||||
deprecatedGATKWalkers.put("AlignmentWalker", "2.2");
|
||||
deprecatedGATKWalkers.put("CountBestAlignments", "2.2");
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -349,32 +365,59 @@ public class GenomeAnalysisEngine {
|
|||
return Collections.unmodifiableList(filters);
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the non-null list of active, initialized read transformers for this engine (see getReadTransformers()); returns nothing
|
||||
*
|
||||
* @param walker the walker we need to apply read transformers to
|
||||
*/
|
||||
public void initializeReadTransformers(final Walker walker) {
|
||||
final List<ReadTransformer> activeTransformers = new ArrayList<ReadTransformer>();
|
||||
|
||||
final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class);
|
||||
final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null;
|
||||
|
||||
final PluginManager<ReadTransformer> pluginManager = new PluginManager<ReadTransformer>(ReadTransformer.class);
|
||||
|
||||
for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) {
|
||||
transformer.initialize(overrideTime, this, walker);
|
||||
if ( transformer.enabled() )
|
||||
activeTransformers.add(transformer);
|
||||
}
|
||||
|
||||
setReadTransformers(activeTransformers);
|
||||
}
|
||||
|
||||
public List<ReadTransformer> getReadTransformers() {
|
||||
return readTransformers;
|
||||
}
|
||||
|
||||
private void setReadTransformers(final List<ReadTransformer> readTransformers) {
|
||||
if ( readTransformers == null )
|
||||
throw new ReviewedStingException("read transformers cannot be null");
|
||||
this.readTransformers = readTransformers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse out the thread allocation from the given command-line argument.
|
||||
*/
|
||||
private void determineThreadAllocation() {
|
||||
Tags tags = parsingEngine.getTags(argCollection.numberOfThreads);
|
||||
if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads);
|
||||
if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread);
|
||||
if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads);
|
||||
|
||||
// TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters.
|
||||
Integer numCPUThreads = null;
|
||||
if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null)
|
||||
throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
|
||||
else if(tags.containsKey("cpu"))
|
||||
numCPUThreads = Integer.parseInt(tags.getValue("cpu"));
|
||||
else if(argCollection.numberOfCPUThreads != null)
|
||||
numCPUThreads = argCollection.numberOfCPUThreads;
|
||||
|
||||
Integer numIOThreads = null;
|
||||
if(tags.containsKey("io") && argCollection.numberOfIOThreads != null)
|
||||
throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
|
||||
else if(tags.containsKey("io"))
|
||||
numIOThreads = Integer.parseInt(tags.getValue("io"));
|
||||
else if(argCollection.numberOfIOThreads != null)
|
||||
numIOThreads = argCollection.numberOfIOThreads;
|
||||
|
||||
this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
|
||||
this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads,
|
||||
argCollection.numberOfCPUThreadsPerDataThread,
|
||||
argCollection.numberOfIOThreads,
|
||||
argCollection.monitorThreadEfficiency);
|
||||
}
|
||||
|
||||
public int getTotalNumberOfThreads() {
|
||||
return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Allow subclasses and others within this package direct access to the walker manager.
|
||||
* @return The walker manager used by this package.
|
||||
|
|
@ -400,23 +443,19 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
protected DownsamplingMethod getDownsamplingMethod() {
|
||||
GATKArgumentCollection argCollection = this.getArguments();
|
||||
DownsamplingMethod method;
|
||||
if(argCollection.getDownsamplingMethod() != null)
|
||||
method = argCollection.getDownsamplingMethod();
|
||||
else if(WalkerManager.getDownsamplingMethod(walker) != null)
|
||||
method = WalkerManager.getDownsamplingMethod(walker);
|
||||
else
|
||||
method = GATKArgumentCollection.getDefaultDownsamplingMethod();
|
||||
return method;
|
||||
boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling;
|
||||
|
||||
DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod();
|
||||
DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling);
|
||||
DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling);
|
||||
|
||||
return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod);
|
||||
}
|
||||
|
||||
protected void setDownsamplingMethod(DownsamplingMethod method) {
|
||||
argCollection.setDownsamplingMethod(method);
|
||||
}
|
||||
|
||||
public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); }
|
||||
public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); }
|
||||
|
||||
protected boolean includeReadsWithDeletionAtLoci() {
|
||||
return walker.includeReadsWithDeletionAtLoci();
|
||||
}
|
||||
|
|
@ -504,6 +543,7 @@ public class GenomeAnalysisEngine {
|
|||
*/
|
||||
protected Iterable<Shard> getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
|
||||
ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
|
||||
DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null;
|
||||
ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
|
||||
|
||||
// If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
|
||||
|
|
@ -538,10 +578,15 @@ public class GenomeAnalysisEngine {
|
|||
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
|
||||
}
|
||||
|
||||
// Use the experimental ReadShardBalancer if experimental downsampling is enabled
|
||||
ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useExperimentalDownsampling ?
|
||||
new ExperimentalReadShardBalancer() :
|
||||
new ReadShardBalancer();
|
||||
|
||||
if(intervals == null)
|
||||
return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer());
|
||||
return readsDataSource.createShardIteratorOverAllReads(readShardBalancer);
|
||||
else
|
||||
return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer());
|
||||
return readsDataSource.createShardIteratorOverIntervals(intervals, readShardBalancer);
|
||||
}
|
||||
else
|
||||
throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
|
||||
|
|
@ -639,14 +684,14 @@ public class GenomeAnalysisEngine {
|
|||
|
||||
// if include argument isn't given, create new set of all possible intervals
|
||||
|
||||
Pair<GenomeLocSortedSet, GenomeLocSortedSet> includeExcludePair = IntervalUtils.parseIntervalBindingsPair(
|
||||
final Pair<GenomeLocSortedSet, GenomeLocSortedSet> includeExcludePair = IntervalUtils.parseIntervalBindingsPair(
|
||||
this.referenceDataSource,
|
||||
argCollection.intervals,
|
||||
argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding,
|
||||
argCollection.excludeIntervals);
|
||||
|
||||
GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst();
|
||||
GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond();
|
||||
final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst();
|
||||
final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond();
|
||||
|
||||
// if no exclude arguments, can return parseIntervalArguments directly
|
||||
if ( excludeSortedSet == null )
|
||||
|
|
@ -657,13 +702,15 @@ public class GenomeAnalysisEngine {
|
|||
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
|
||||
|
||||
// logging messages only printed when exclude (-XL) arguments are given
|
||||
long toPruneSize = includeSortedSet.coveredSize();
|
||||
long toExcludeSize = excludeSortedSet.coveredSize();
|
||||
long intervalSize = intervals.coveredSize();
|
||||
final long toPruneSize = includeSortedSet.coveredSize();
|
||||
final long toExcludeSize = excludeSortedSet.coveredSize();
|
||||
final long intervalSize = intervals.coveredSize();
|
||||
logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
|
||||
logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
|
||||
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
|
||||
}
|
||||
|
||||
logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize()));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -697,13 +744,12 @@ public class GenomeAnalysisEngine {
|
|||
protected void initializeDataSources() {
|
||||
logger.info("Strictness is " + argCollection.strictnessLevel);
|
||||
|
||||
// TODO -- REMOVE ME
|
||||
BAQ.DEFAULT_GOP = argCollection.BAQGOP;
|
||||
|
||||
validateSuppliedReference();
|
||||
setReferenceDataSource(argCollection.referenceFile);
|
||||
|
||||
validateSuppliedReads();
|
||||
initializeReadTransformers(walker);
|
||||
|
||||
readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference());
|
||||
|
||||
for (ReadFilter filter : filters)
|
||||
|
|
@ -784,14 +830,13 @@ public class GenomeAnalysisEngine {
|
|||
* @return A data source for the given set of reads.
|
||||
*/
|
||||
private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
|
||||
DownsamplingMethod method = getDownsamplingMethod();
|
||||
DownsamplingMethod downsamplingMethod = getDownsamplingMethod();
|
||||
|
||||
// Synchronize the method back into the collection so that it shows up when
|
||||
// interrogating for the downsample method during command line recreation.
|
||||
setDownsamplingMethod(method);
|
||||
setDownsamplingMethod(downsamplingMethod);
|
||||
|
||||
if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF)
|
||||
throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested.");
|
||||
logger.info(downsamplingMethod);
|
||||
|
||||
if (argCollection.removeProgramRecords && argCollection.keepProgramRecords)
|
||||
throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options");
|
||||
|
|
@ -809,14 +854,11 @@ public class GenomeAnalysisEngine {
|
|||
argCollection.useOriginalBaseQualities,
|
||||
argCollection.strictnessLevel,
|
||||
argCollection.readBufferSize,
|
||||
method,
|
||||
downsamplingMethod,
|
||||
new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
|
||||
filters,
|
||||
readTransformers,
|
||||
includeReadsWithDeletionAtLoci(),
|
||||
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
|
||||
getWalkerBAQQualityMode(),
|
||||
refReader,
|
||||
getBaseRecalibration(),
|
||||
argCollection.defaultBaseQualities,
|
||||
removeProgramRecords);
|
||||
}
|
||||
|
|
@ -943,6 +985,22 @@ public class GenomeAnalysisEngine {
|
|||
return this.intervals;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of regions of the genome being processed. If the user
|
||||
* requested specific intervals, return those, otherwise return regions
|
||||
* corresponding to the entire genome. Never returns null.
|
||||
*
|
||||
* @return a non-null set of intervals being processed
|
||||
*/
|
||||
@Ensures("result != null")
|
||||
public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() {
|
||||
if ( getIntervals() == null )
|
||||
// if we don't have any intervals defined, create intervals from the reference itself
|
||||
return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary());
|
||||
else
|
||||
return getIntervals();
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the list of filters employed by this engine.
|
||||
* @return Collection of filters (actual instances) used by this engine.
|
||||
|
|
@ -1000,7 +1058,19 @@ public class GenomeAnalysisEngine {
|
|||
* owned by the caller; the caller can do with the object what they wish.
|
||||
*/
|
||||
public ReadMetrics getCumulativeMetrics() {
|
||||
return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics();
|
||||
// todo -- probably shouldn't be lazy
|
||||
if ( cumulativeMetrics == null )
|
||||
cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics();
|
||||
return cumulativeMetrics;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the global ThreadEfficiencyMonitor, if there is one
|
||||
*
|
||||
* @return the monitor, or null if none is active
|
||||
*/
|
||||
public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() {
|
||||
return threadEfficiencyMonitor;
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
|
|
|||
|
|
@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk;
|
|||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
|
@ -119,11 +118,18 @@ public class ReadMetrics implements Cloneable {
|
|||
return nRecords;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increments, by the given amount, the number of 'iterations' (one call of filter/map/reduce sequence) completed.
|
||||
*/
|
||||
public void incrementNumIterations(final long by) {
|
||||
nRecords += by;
|
||||
}
|
||||
|
||||
/**
|
||||
* Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed.
|
||||
*/
|
||||
public void incrementNumIterations() {
|
||||
nRecords++;
|
||||
incrementNumIterations(1);
|
||||
}
|
||||
|
||||
public long getNumReadsSeen() {
|
||||
|
|
|
|||
|
|
@ -1,15 +1,15 @@
|
|||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileReader;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
/**
|
||||
* User: hanna
|
||||
* Date: May 14, 2009
|
||||
|
|
@ -30,16 +30,14 @@ import java.util.Collection;
|
|||
public class ReadProperties {
|
||||
private final Collection<SAMReaderID> readers;
|
||||
private final SAMFileHeader header;
|
||||
private final SAMFileHeader.SortOrder sortOrder;
|
||||
private final SAMFileReader.ValidationStringency validationStringency;
|
||||
private final DownsamplingMethod downsamplingMethod;
|
||||
private final ValidationExclusion exclusionList;
|
||||
private final Collection<ReadFilter> supplementalFilters;
|
||||
private final List<ReadTransformer> readTransformers;
|
||||
private final boolean includeReadsWithDeletionAtLoci;
|
||||
private final boolean useOriginalBaseQualities;
|
||||
private final BAQ.CalculationMode cmode;
|
||||
private final BAQ.QualityMode qmode;
|
||||
private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired
|
||||
private final BaseRecalibration bqsrApplier;
|
||||
private final byte defaultBaseQualities;
|
||||
|
||||
/**
|
||||
|
|
@ -67,6 +65,14 @@ public class ReadProperties {
|
|||
return header;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the sort order of the reads
|
||||
* @return the sort order of the reads
|
||||
*/
|
||||
public SAMFileHeader.SortOrder getSortOrder() {
|
||||
return sortOrder;
|
||||
}
|
||||
|
||||
/**
|
||||
* How strict should validation be?
|
||||
* @return Stringency of validation.
|
||||
|
|
@ -95,6 +101,11 @@ public class ReadProperties {
|
|||
return supplementalFilters;
|
||||
}
|
||||
|
||||
|
||||
public List<ReadTransformer> getReadTransformers() {
|
||||
return readTransformers;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether to use original base qualities.
|
||||
* @return Whether to use original base qualities.
|
||||
|
|
@ -103,16 +114,6 @@ public class ReadProperties {
|
|||
return useOriginalBaseQualities;
|
||||
}
|
||||
|
||||
|
||||
public BAQ.QualityMode getBAQQualityMode() { return qmode; }
|
||||
public BAQ.CalculationMode getBAQCalculationMode() { return cmode; }
|
||||
|
||||
public IndexedFastaSequenceFile getRefReader() {
|
||||
return refReader;
|
||||
}
|
||||
|
||||
public BaseRecalibration getBQSRApplier() { return bqsrApplier; }
|
||||
|
||||
/**
|
||||
* @return Default base quality value to fill reads missing base quality information.
|
||||
*/
|
||||
|
|
@ -134,36 +135,29 @@ public class ReadProperties {
|
|||
* @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
|
||||
* will explicitly list reads with deletion over the current reference base; otherwise, only observed
|
||||
* bases will be seen in the pileups, and the deletions will be skipped silently.
|
||||
* @param cmode How should we apply the BAQ calculation to the reads?
|
||||
* @param qmode How should we apply the BAQ calculation to the reads?
|
||||
* @param refReader if applyBAQ is true, must be a valid pointer to a indexed fasta file reads so we can get the ref bases for BAQ calculation
|
||||
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
|
||||
*/
|
||||
public ReadProperties( Collection<SAMReaderID> samFiles,
|
||||
SAMFileHeader header,
|
||||
SAMFileHeader.SortOrder sortOrder,
|
||||
boolean useOriginalBaseQualities,
|
||||
SAMFileReader.ValidationStringency strictness,
|
||||
DownsamplingMethod downsamplingMethod,
|
||||
ValidationExclusion exclusionList,
|
||||
Collection<ReadFilter> supplementalFilters,
|
||||
List<ReadTransformer> readTransformers,
|
||||
boolean includeReadsWithDeletionAtLoci,
|
||||
BAQ.CalculationMode cmode,
|
||||
BAQ.QualityMode qmode,
|
||||
IndexedFastaSequenceFile refReader,
|
||||
BaseRecalibration bqsrApplier,
|
||||
byte defaultBaseQualities) {
|
||||
this.readers = samFiles;
|
||||
this.header = header;
|
||||
this.sortOrder = sortOrder;
|
||||
this.validationStringency = strictness;
|
||||
this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod;
|
||||
this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList;
|
||||
this.supplementalFilters = supplementalFilters;
|
||||
this.readTransformers = readTransformers;
|
||||
this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
|
||||
this.useOriginalBaseQualities = useOriginalBaseQualities;
|
||||
this.cmode = cmode;
|
||||
this.qmode = qmode;
|
||||
this.refReader = refReader;
|
||||
this.bqsrApplier = bqsrApplier;
|
||||
this.defaultBaseQualities = defaultBaseQualities;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,15 +27,18 @@ package org.broadinstitute.sting.gatk;
|
|||
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.filters.FilterManager;
|
||||
import org.broadinstitute.sting.gatk.filters.ReadFilter;
|
||||
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
|
||||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet;
|
||||
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||
|
||||
import java.lang.annotation.Annotation;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -303,9 +306,10 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
* downsampling method is specified on the command-line, the command-line version will
|
||||
* be used instead.
|
||||
* @param walkerClass The class of the walker to interrogate.
|
||||
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
|
||||
* @return The downsampling method, as specified by the walker. Null if none exists.
|
||||
*/
|
||||
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass) {
|
||||
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass, boolean useExperimentalDownsampling) {
|
||||
DownsamplingMethod downsamplingMethod = null;
|
||||
|
||||
if( walkerClass.isAnnotationPresent(Downsample.class) ) {
|
||||
|
|
@ -313,17 +317,17 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
DownsampleType type = downsampleParameters.by();
|
||||
Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null;
|
||||
Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null;
|
||||
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction);
|
||||
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling);
|
||||
}
|
||||
|
||||
return downsamplingMethod;
|
||||
}
|
||||
|
||||
public static BAQ.QualityMode getBAQQualityMode(Walker walker) {
|
||||
return walker.getClass().getAnnotation(BAQMode.class).QualityMode();
|
||||
public static <T extends Annotation> T getWalkerAnnotation(final Walker walker, final Class<T> clazz) {
|
||||
return walker.getClass().getAnnotation(clazz);
|
||||
}
|
||||
|
||||
public static BAQ.ApplicationTime getBAQApplicationTime(Walker walker) {
|
||||
public static ReadTransformer.ApplicationTime getBAQApplicationTime(Walker walker) {
|
||||
return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime();
|
||||
}
|
||||
|
||||
|
|
@ -332,10 +336,11 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
* downsampling method is specified on the command-line, the command-line version will
|
||||
* be used instead.
|
||||
* @param walker The walker to interrogate.
|
||||
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
|
||||
* @return The downsampling method, as specified by the walker. Null if none exists.
|
||||
*/
|
||||
public static DownsamplingMethod getDownsamplingMethod(Walker walker) {
|
||||
return getDownsamplingMethod(walker.getClass());
|
||||
public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) {
|
||||
return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -31,8 +31,8 @@ import org.broadinstitute.sting.commandline.Argument;
|
|||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.IntervalBinding;
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
|
||||
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
|
|
@ -41,7 +41,9 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
|||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* @author aaron
|
||||
|
|
@ -64,12 +66,35 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
|
||||
public Integer readBufferSize = null;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// GATKRunReport options
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false)
|
||||
public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD;
|
||||
|
||||
@Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false)
|
||||
public File gatkKeyFile = null;
|
||||
|
||||
/**
|
||||
* The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary String tag that can be
|
||||
* used to group together runs during later analysis. One use of this capability is to tag runs as GATK
|
||||
* performance tests, so that the performance of the GATK over time can be assessed from the logs directly.
|
||||
*
|
||||
* Note that the tags do not conform to any ontology, so you are free to use any tags that you might find
|
||||
* meaningful.
|
||||
*/
|
||||
@Argument(fullName = "tag", shortName = "tag", doc="Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis", required = false)
|
||||
public String tag = "NA";
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// XXX
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
|
||||
public List<String> readFilters = new ArrayList<String>();
|
||||
|
||||
|
|
@ -115,15 +140,14 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
|
||||
public boolean nonDeterministicRandomSeed = false;
|
||||
|
||||
/**
|
||||
* The override mechanism in the GATK, by default, populates the command-line arguments, then
|
||||
* the defaults from the walker annotations. Unfortunately, walker annotations should be trumped
|
||||
* by a user explicitly specifying command-line arguments.
|
||||
* TODO: Change the GATK so that walker defaults are loaded first, then command-line arguments.
|
||||
*/
|
||||
private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
|
||||
private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000;
|
||||
@Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.")
|
||||
public boolean disableRandomization = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Downsampling Arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false)
|
||||
public DownsampleType downsamplingType = null;
|
||||
|
||||
|
|
@ -133,17 +157,20 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false)
|
||||
public Integer downsampleCoverage = null;
|
||||
|
||||
@Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false)
|
||||
@Hidden
|
||||
public boolean enableExperimentalDownsampling = false;
|
||||
|
||||
/**
|
||||
* Gets the downsampling method explicitly specified by the user. If the user didn't specify
|
||||
* any downsampling arguments (type, fraction or coverage), returns null.
|
||||
* @return The explicitly specified downsampling mechanism, or null if the user specified none.
|
||||
*/
|
||||
public DownsamplingMethod getDownsamplingMethod() {
|
||||
if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null)
|
||||
if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null )
|
||||
return null;
|
||||
if(downsamplingType == null && downsampleCoverage != null)
|
||||
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,downsampleCoverage,null);
|
||||
return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction);
|
||||
|
||||
return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -153,9 +180,11 @@ public class GATKArgumentCollection {
|
|||
public void setDownsamplingMethod(DownsamplingMethod method) {
|
||||
if (method == null)
|
||||
throw new IllegalArgumentException("method is null");
|
||||
|
||||
downsamplingType = method.type;
|
||||
downsampleCoverage = method.toCoverage;
|
||||
downsampleFraction = method.toFraction;
|
||||
enableExperimentalDownsampling = method.useExperimentalDownsampling;
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
|
@ -174,17 +203,14 @@ public class GATKArgumentCollection {
|
|||
// performance log arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
|
||||
public File performanceLog = null;
|
||||
|
||||
/**
|
||||
* Gets the default downsampling method, returned if the user didn't specify any downsampling
|
||||
* method.
|
||||
* @return The default downsampling mechanism, or null if none exists.
|
||||
* The file name for the GATK performance log output, or null if you don't want to generate the
|
||||
* detailed performance logging table. This table is suitable for importing into R or any
|
||||
* other analysis software that can read tsv files
|
||||
*/
|
||||
public static DownsamplingMethod getDefaultDownsamplingMethod() {
|
||||
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null);
|
||||
}
|
||||
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
|
||||
public File performanceLog = null;
|
||||
|
||||
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
|
||||
public Boolean useOriginalBaseQualities = false;
|
||||
|
|
@ -256,20 +282,40 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
|
||||
public ValidationExclusion.TYPE unsafe;
|
||||
|
||||
/** How many threads should be allocated to this analysis. */
|
||||
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false)
|
||||
public Integer numberOfThreads = 1;
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Multi-threading arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types.
|
||||
* TODO: Kill this when I can do a tagged integer in Queue.
|
||||
* How many data threads should be allocated to this analysis? Data threads contains N cpu threads per
|
||||
* data thread, and act as completely data parallel processing, increasing the memory usage of GATK
|
||||
* by M data threads. Data threads generally scale extremely effectively, up to 24 cores
|
||||
*/
|
||||
@Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false)
|
||||
@Hidden
|
||||
public Integer numberOfCPUThreads = null;
|
||||
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false)
|
||||
public Integer numberOfDataThreads = 1;
|
||||
|
||||
/**
|
||||
* How many CPU threads should be allocated per data thread? Each CPU thread operates the map
|
||||
* cycle independently, but may run into earlier scaling problems with IO than data threads. Has
|
||||
* the benefit of not requiring X times as much memory per thread as data threads do, but rather
|
||||
* only a constant overhead.
|
||||
*/
|
||||
@Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false)
|
||||
public int numberOfCPUThreadsPerDataThread = 1;
|
||||
|
||||
@Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false)
|
||||
@Hidden
|
||||
public Integer numberOfIOThreads = null;
|
||||
public int numberOfIOThreads = 0;
|
||||
|
||||
/**
|
||||
* Enable GATK to monitor its own threading efficiency, at an itsy-bitsy tiny
|
||||
* cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for
|
||||
* debugging purposes.
|
||||
*/
|
||||
@Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable GATK threading efficiency monitoring", required = false)
|
||||
public Boolean monitorThreadEfficiency = false;
|
||||
|
||||
@Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
|
||||
public Integer numberOfBAMFileHandles = null;
|
||||
|
|
|
|||
|
|
@ -1,13 +1,12 @@
|
|||
package org.broadinstitute.sting.gatk.arguments;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Advanced;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
/**
|
||||
* Created with IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
|
|
@ -59,4 +58,18 @@ public class StandardCallerArgumentCollection {
|
|||
@Advanced
|
||||
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES = 3;
|
||||
|
||||
/**
|
||||
* If there are more than this number of alternate alleles presented to the genotyper (either through discovery or GENOTYPE_GIVEN_ALLELES),
|
||||
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
|
||||
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
|
||||
* that you not play around with this parameter.
|
||||
*/
|
||||
@Advanced
|
||||
@Argument(fullName = "max_alternate_alleles_for_indels", shortName = "maxAltAllelesForIndels", doc = "Maximum number of alternate alleles to genotype for indels only", required = false)
|
||||
public int MAX_ALTERNATE_ALLELES_FOR_INDELS = 2;
|
||||
|
||||
@Hidden
|
||||
@Argument(shortName = "logExactCalls", doc="x", required=false)
|
||||
public File exactCallsLog = null;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -177,7 +177,7 @@ public class ReferenceContext {
|
|||
* @return The base at the given locus from the reference.
|
||||
*/
|
||||
public byte getBase() {
|
||||
return getBases()[(int)(locus.getStart() - window.getStart())];
|
||||
return getBases()[(locus.getStart() - window.getStart())];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,143 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.providers;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedList;
|
||||
import java.util.ListIterator;
|
||||
|
||||
/**
|
||||
* Key algorithmic helper for ReadBasedReferenceOrderedData
|
||||
*
|
||||
* Takes a single iterator of features, and provides a single capability that returns
|
||||
* the list of RODs that overlap an interval. Allows sequential getOverlapping calls
|
||||
* from intervals provided that these intervals always have increasing getStart() values.
|
||||
*
|
||||
*/
|
||||
class IntervalOverlappingRODsFromStream {
|
||||
/**
|
||||
* Only held for QC purposes
|
||||
*/
|
||||
GenomeLoc lastQuery = null;
|
||||
|
||||
private final String name;
|
||||
private final LinkedList<GATKFeature> currentFeatures = new LinkedList<GATKFeature>();
|
||||
private final PeekableIterator<RODRecordList> futureFeatures;
|
||||
|
||||
/**
|
||||
* Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and
|
||||
* returns RODRecordLists having name
|
||||
*
|
||||
* @param name
|
||||
* @param futureFeatures
|
||||
*/
|
||||
IntervalOverlappingRODsFromStream(final String name, final PeekableIterator<RODRecordList> futureFeatures) {
|
||||
if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null");
|
||||
|
||||
this.name = name;
|
||||
this.futureFeatures = futureFeatures;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the list of RODs overlapping loc from this stream of RODs.
|
||||
*
|
||||
* Sequential calls to this function must obey the rule that loc2.getStart >= loc1.getStart
|
||||
*
|
||||
* @param loc the interval to query
|
||||
* @return a non-null RODRecordList containing the overlapping RODs, which may be empty
|
||||
*/
|
||||
@Ensures({"overlaps(loc, result)",
|
||||
"! futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)",
|
||||
"result != null"})
|
||||
public RODRecordList getOverlapping(final GenomeLoc loc) {
|
||||
if ( lastQuery != null && loc.getStart() < lastQuery.getStart() )
|
||||
throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery));
|
||||
|
||||
trimCurrentFeaturesToLoc(loc);
|
||||
readOverlappingFutureFeatures(loc);
|
||||
return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* For contract assurance. Checks that all bindings in loc overlap
|
||||
*
|
||||
* @param loc
|
||||
* @param bindings
|
||||
* @return
|
||||
*/
|
||||
@Requires({"loc != null", "bindings != null"})
|
||||
private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) {
|
||||
for ( final GATKFeature feature : bindings )
|
||||
if ( ! feature.getLocation().overlapsP(loc) )
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Subset the features in all to those that overlap with loc
|
||||
*
|
||||
* The current features list contains everything read that cannot be thrown away yet, but not
|
||||
* everything in there necessarily overlaps with loc. Subset to just those that do overlap
|
||||
*
|
||||
* @param loc the location that features must overlap
|
||||
* @param all the list of all features
|
||||
* @return a subset of all that overlaps with loc
|
||||
*/
|
||||
@Requires({"loc != null", "all != null"})
|
||||
@Ensures("result.size() <= all.size()")
|
||||
private Collection<GATKFeature> subsetToOverlapping(final GenomeLoc loc, final Collection<GATKFeature> all) {
|
||||
final LinkedList<GATKFeature> overlapping = new LinkedList<GATKFeature>();
|
||||
for ( final GATKFeature feature : all )
|
||||
if ( feature.getLocation().overlapsP(loc) )
|
||||
overlapping.add(feature);
|
||||
return overlapping;
|
||||
}
|
||||
|
||||
/**
|
||||
* Update function. Remove all elements of currentFeatures that end before loc
|
||||
*
|
||||
* @param loc the location to use
|
||||
*/
|
||||
@Requires("loc != null")
|
||||
@Ensures("currentFeatures.size() <= old(currentFeatures.size())")
|
||||
private void trimCurrentFeaturesToLoc(final GenomeLoc loc) {
|
||||
final ListIterator<GATKFeature> it = currentFeatures.listIterator();
|
||||
while ( it.hasNext() ) {
|
||||
final GATKFeature feature = it.next();
|
||||
if ( feature.getLocation().isBefore(loc) )
|
||||
it.remove();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Update function: Read all elements from futureFeatures that overlap with loc
|
||||
*
|
||||
* Stops at the first element that starts before the end of loc, or the stream empties
|
||||
*
|
||||
* @param loc
|
||||
*/
|
||||
@Requires("loc != null")
|
||||
@Ensures("currentFeatures.size() >= old(currentFeatures.size())")
|
||||
private void readOverlappingFutureFeatures(final GenomeLoc loc) {
|
||||
while ( futureFeatures.hasNext() ) {
|
||||
final GenomeLoc nextLoc = futureFeatures.peek().getLocation();
|
||||
if ( nextLoc.isBefore(loc) ) {
|
||||
futureFeatures.next(); // next rod element is before loc, throw it away and keep looking
|
||||
} else if ( nextLoc.isPast(loc) ) {
|
||||
break; // next element is past loc, stop looking but don't pop it
|
||||
} else if ( nextLoc.overlapsP(loc) ) {
|
||||
// add overlapping elements to our current features, removing from stream
|
||||
for ( final GATKFeature feature : futureFeatures.next() ) {
|
||||
currentFeatures.add(feature);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.providers;
|
||||
|
||||
import org.broadinstitute.sting.gatk.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.iterators.LocusIterator;
|
||||
|
|
@ -135,8 +135,13 @@ public abstract class LocusView extends LocusIterator implements View {
|
|||
|
||||
// Cache the current and apply filtering.
|
||||
AlignmentContext current = nextLocus;
|
||||
if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null )
|
||||
|
||||
// The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling:
|
||||
if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling &&
|
||||
sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) {
|
||||
|
||||
current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
|
||||
}
|
||||
|
||||
// Indicate that the next operation will need to advance.
|
||||
nextLocus = null;
|
||||
|
|
|
|||
|
|
@ -58,7 +58,7 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView {
|
|||
// todo -- warning, I removed the reference to the name from states
|
||||
bindings.add( state.iterator.seekForward(loc) );
|
||||
|
||||
return new RefMetaDataTracker(bindings, referenceContext);
|
||||
return new RefMetaDataTracker(bindings);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -23,40 +23,63 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.datasources.providers;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.datasources.reads.ReadShard;
|
||||
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.TreeMap;
|
||||
|
||||
/** a ROD view for reads. This provides the Read traversals a way of getting a ReadMetaDataTracker */
|
||||
/** a ROD view for reads. This provides the Read traversals a way of getting a RefMetaDataTracker */
|
||||
public class ReadBasedReferenceOrderedView implements View {
|
||||
private final WindowedData window;
|
||||
|
||||
public ReadBasedReferenceOrderedView(ShardDataProvider provider) {
|
||||
window = new WindowedData(provider);
|
||||
provider.register(this);
|
||||
}
|
||||
// a list of the RMDDataState (location->iterators)
|
||||
private final List<RMDDataState> states = new ArrayList<RMDDataState>(1);
|
||||
private final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker();
|
||||
|
||||
/**
|
||||
* for testing only please
|
||||
*
|
||||
* @param data the window provider
|
||||
* Used to get genome locs for reads
|
||||
*/
|
||||
ReadBasedReferenceOrderedView(WindowedData data) {
|
||||
window = data;
|
||||
private final GenomeLocParser genomeLocParser;
|
||||
|
||||
/**
|
||||
* The total extent of all reads in this span. We create iterators from our RODs
|
||||
* from the start of this span, to the end.
|
||||
*/
|
||||
private final GenomeLoc shardSpan;
|
||||
|
||||
public ReadBasedReferenceOrderedView(final ShardDataProvider provider) {
|
||||
this.genomeLocParser = provider.getGenomeLocParser();
|
||||
// conditional to optimize the case where we don't have any ROD data
|
||||
this.shardSpan = provider.getReferenceOrderedData() != null ? ((ReadShard)provider.getShard()).getReadsSpan() : null;
|
||||
provider.register(this);
|
||||
|
||||
if ( provider.getReferenceOrderedData() != null && ! shardSpan.isUnmapped() ) {
|
||||
for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
|
||||
states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan)));
|
||||
}
|
||||
}
|
||||
|
||||
public ReadMetaDataTracker getReferenceOrderedDataForRead(SAMRecord read) {
|
||||
return window.getTracker(read);
|
||||
|
||||
/**
|
||||
* Testing constructor
|
||||
*/
|
||||
protected ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser,
|
||||
final GenomeLoc shardSpan,
|
||||
final List<String> names,
|
||||
final List<PeekableIterator<RODRecordList>> featureSources) {
|
||||
this.genomeLocParser = genomeLocParser;
|
||||
this.shardSpan = shardSpan;
|
||||
for ( int i = 0; i < names.size(); i++ )
|
||||
states.add(new RMDDataState(names.get(i), featureSources.get(i)));
|
||||
}
|
||||
|
||||
public Collection<Class<? extends View>> getConflictingViews() {
|
||||
|
|
@ -65,135 +88,72 @@ public class ReadBasedReferenceOrderedView implements View {
|
|||
return classes;
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (window != null) window.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** stores a window of data, dropping RODs if we've passed the new reads start point. */
|
||||
class WindowedData {
|
||||
// the queue of possibly in-frame RODs; RODs are removed as soon as they are out of scope
|
||||
private final TreeMap<Integer, RODMetaDataContainer> mapping = new TreeMap<Integer, RODMetaDataContainer>();
|
||||
|
||||
// our current location from the last read we processed
|
||||
private GenomeLoc currentLoc;
|
||||
|
||||
// a list of the RMDDataState (location->iterators)
|
||||
private List<RMDDataState> states;
|
||||
|
||||
// the provider; where we get all our information
|
||||
private final ShardDataProvider provider;
|
||||
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(WindowedData.class);
|
||||
|
||||
/**
|
||||
* create a WindowedData given a shard provider
|
||||
*
|
||||
* @param provider the ShardDataProvider
|
||||
*/
|
||||
public WindowedData(ShardDataProvider provider) {
|
||||
this.provider = provider;
|
||||
}
|
||||
|
||||
/**
|
||||
* load the states dynamically, since the only way to get a genome loc is from the read (the shard doesn't have one)
|
||||
*
|
||||
* @param provider the ShardDataProvider
|
||||
* @param rec the current read
|
||||
*/
|
||||
private void getStates(ShardDataProvider provider, SAMRecord rec) {
|
||||
|
||||
int stop = Integer.MAX_VALUE;
|
||||
// figure out the appropriate alignment stop
|
||||
if (provider.hasReference()) {
|
||||
stop = provider.getReference().getSequenceDictionary().getSequence(rec.getReferenceIndex()).getSequenceLength();
|
||||
}
|
||||
|
||||
// calculate the range of positions we need to look at
|
||||
GenomeLoc range = provider.getGenomeLocParser().createGenomeLoc(rec.getReferenceName(),
|
||||
rec.getAlignmentStart(),
|
||||
stop);
|
||||
states = new ArrayList<RMDDataState>();
|
||||
if (provider.getReferenceOrderedData() != null)
|
||||
for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
|
||||
states.add(new RMDDataState(dataSource, dataSource.seek(range)));
|
||||
}
|
||||
|
||||
/**
|
||||
* this function is for testing only
|
||||
*
|
||||
* @param states a list of RMDDataState to initialize with
|
||||
*/
|
||||
WindowedData(List<RMDDataState> states) {
|
||||
this.states = states;
|
||||
provider = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* create a ReadMetaDataTracker given the current read
|
||||
* create a RefMetaDataTracker given the current read
|
||||
*
|
||||
* @param rec the read
|
||||
*
|
||||
* @return a ReadMetaDataTracker for the read, from which you can get ROD -> read alignments
|
||||
* @return a RefMetaDataTracker for the read, from which you can get ROD -> read alignments
|
||||
*/
|
||||
public ReadMetaDataTracker getTracker(SAMRecord rec) {
|
||||
updatePosition(rec);
|
||||
return new ReadMetaDataTracker(provider.getGenomeLocParser(), rec, mapping);
|
||||
@Requires("rec != null")
|
||||
@Ensures("result != null")
|
||||
public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) {
|
||||
if ( rec.getReadUnmappedFlag() )
|
||||
// empty RODs for unmapped reads
|
||||
return new RefMetaDataTracker();
|
||||
else
|
||||
return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec));
|
||||
}
|
||||
|
||||
/**
|
||||
* update the position we're storing
|
||||
*
|
||||
* @param rec the read to use for start and end
|
||||
*/
|
||||
private void updatePosition(SAMRecord rec) {
|
||||
if (states == null) getStates(this.provider, rec);
|
||||
currentLoc = provider.getGenomeLocParser().createGenomeLoc(rec);
|
||||
|
||||
// flush the queue looking for records we've passed over
|
||||
while (mapping.size() > 0 && mapping.firstKey() < currentLoc.getStart())
|
||||
mapping.pollFirstEntry(); // toss away records that we've passed
|
||||
|
||||
// add new data to the queue
|
||||
for (RMDDataState state : states) {
|
||||
// move into position
|
||||
while (state.iterator.hasNext() && state.iterator.peekNextLocation().isBefore(currentLoc))
|
||||
state.iterator.next();
|
||||
while (state.iterator.hasNext() && state.iterator.peekNextLocation().overlapsP(currentLoc)) {
|
||||
RODRecordList list = state.iterator.next();
|
||||
for (GATKFeature datum : list) {
|
||||
if (!mapping.containsKey(list.getLocation().getStart()))
|
||||
mapping.put(list.getLocation().getStart(), new RODMetaDataContainer());
|
||||
mapping.get(list.getLocation().getStart()).addEntry(datum);
|
||||
}
|
||||
}
|
||||
@Requires({"interval != null", "shardSpan == null || shardSpan.isUnmapped() || shardSpan.containsP(interval)"})
|
||||
@Ensures("result != null")
|
||||
public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) {
|
||||
if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers)
|
||||
return EMPTY_TRACKER;
|
||||
else {
|
||||
final List<RODRecordList> bindings = new ArrayList<RODRecordList>(states.size());
|
||||
for ( final RMDDataState state : states )
|
||||
bindings.add(state.stream.getOverlapping(interval));
|
||||
return new RefMetaDataTracker(bindings);
|
||||
}
|
||||
}
|
||||
|
||||
/** Closes the current view. */
|
||||
/**
|
||||
* Closes the current view.
|
||||
*/
|
||||
public void close() {
|
||||
if (states == null) return;
|
||||
for (RMDDataState state : states)
|
||||
state.dataSource.close( state.iterator );
|
||||
for (final RMDDataState state : states)
|
||||
state.close();
|
||||
|
||||
// Drop the closed states so no closed iterator is retained. Note that lookups made after close() do not fail: with no states left they return the empty tracker.
|
||||
states = null;
|
||||
states.clear();
|
||||
}
|
||||
|
||||
/** Models the traversal state of a given ROD lane. */
|
||||
private static class RMDDataState {
|
||||
public final ReferenceOrderedDataSource dataSource;
|
||||
public final IntervalOverlappingRODsFromStream stream;
|
||||
private final LocationAwareSeekableRODIterator iterator;
|
||||
|
||||
}
|
||||
public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) {
|
||||
this.dataSource = dataSource;
|
||||
this.iterator = iterator;
|
||||
this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator<RODRecordList>(iterator));
|
||||
}
|
||||
|
||||
/** Models the traversal state of a given ROD lane. */
|
||||
class RMDDataState {
|
||||
public final ReferenceOrderedDataSource dataSource;
|
||||
public final LocationAwareSeekableRODIterator iterator;
|
||||
/**
|
||||
* For testing
|
||||
*/
|
||||
public RMDDataState(final String name, final PeekableIterator<RODRecordList> iterator) {
|
||||
this.dataSource = null;
|
||||
this.iterator = null;
|
||||
this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator<RODRecordList>(iterator));
|
||||
}
|
||||
|
||||
public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) {
|
||||
this.dataSource = dataSource;
|
||||
this.iterator = iterator;
|
||||
public void close() {
|
||||
if ( dataSource != null )
|
||||
dataSource.close( iterator );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -59,16 +59,18 @@ public class ReadReferenceView extends ReferenceView {
|
|||
}
|
||||
|
||||
public byte[] getBases() {
|
||||
// System.out.printf("Getting bases for location %s%n", loc);
|
||||
// throw new StingException("x");
|
||||
return getReferenceBases(loc);
|
||||
}
|
||||
}
|
||||
|
||||
public ReferenceContext getReferenceContext( SAMRecord read ) {
|
||||
/**
|
||||
* Return a reference context appropriate for the span of read
|
||||
*
|
||||
* @param read the mapped read to test
|
||||
* @return a ReferenceContext whose locus and window are both the genomic span created from the read
|
||||
*/
|
||||
public ReferenceContext getReferenceContext( final SAMRecord read ) {
|
||||
GenomeLoc loc = genomeLocParser.createGenomeLoc(read);
|
||||
// byte[] bases = super.getReferenceBases(loc);
|
||||
// return new ReferenceContext( loc, loc, bases );
|
||||
return new ReferenceContext( genomeLocParser, loc, loc, getReferenceBasesProvider(loc) );
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -101,7 +101,7 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
|
|||
public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) {
|
||||
// special case the interval again -- add it into the ROD
|
||||
if ( interval != null ) { allTracksHere.add(interval); }
|
||||
return new RefMetaDataTracker(allTracksHere, referenceContext);
|
||||
return new RefMetaDataTracker(allTracksHere);
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
|
|
|
|||
|
|
@ -94,6 +94,13 @@ public abstract class ShardDataProvider {
|
|||
return referenceOrderedData;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if reference ordered data will be provided by this shard
|
||||
*/
|
||||
public boolean hasReferenceOrderedData() {
|
||||
return ! getReferenceOrderedData().isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a data provider for the shard given the reads and reference.
|
||||
* @param shard The chunk of data over which traversals happen.
|
||||
|
|
|
|||
|
|
@ -124,7 +124,24 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
*/
|
||||
private FilePointer generatePointerOverEntireFileset() {
|
||||
FilePointer filePointer = new FilePointer();
|
||||
Map<SAMReaderID,GATKBAMFileSpan> currentPosition = dataSource.getCurrentPosition();
|
||||
|
||||
// This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is
|
||||
// the only FilePointer we will create. This allows us to have this FilePointer represent regions from
|
||||
// multiple contigs
|
||||
filePointer.setIsMonolithic(true);
|
||||
|
||||
Map<SAMReaderID,GATKBAMFileSpan> currentPosition;
|
||||
|
||||
// Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling
|
||||
// TODO: clean this up once the experimental downsampling engine fork collapses
|
||||
if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useExperimentalDownsampling ) {
|
||||
currentPosition = dataSource.getInitialReaderPositions();
|
||||
}
|
||||
else {
|
||||
currentPosition = dataSource.getCurrentPosition();
|
||||
|
||||
}
|
||||
|
||||
for(SAMReaderID reader: dataSource.getReaderIDs())
|
||||
filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart()));
|
||||
return filePointer;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,228 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards.
|
||||
*
|
||||
* When processing FilePointers, our strategy is to aggregate all FilePointers for each contig
|
||||
* together into one monolithic FilePointer, create one persistent set of read iterators over
|
||||
* that monolithic FilePointer, and repeatedly use that persistent set of read iterators to
|
||||
* fill read shards with reads.
|
||||
*
|
||||
* This strategy has several important advantages:
|
||||
*
|
||||
* 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole
|
||||
* contig will have regions that overlap with other FilePointers on the same contig, due
|
||||
* to the limited granularity of BAM index data. By creating only one FilePointer per contig,
|
||||
* we avoid having to track how much of each file region we've visited (as we did in the
|
||||
* former implementation), we avoid expensive non-sequential access patterns in the files,
|
||||
* and we avoid having to repeatedly re-create our iterator chain for every small region
|
||||
* of interest.
|
||||
*
|
||||
* 2. We avoid boundary issues with the engine-level downsampling. Since we create a single
|
||||
* persistent set of read iterators (which include the downsampling iterator(s)) per contig,
|
||||
* the downsampling process is never interrupted by FilePointer or Shard boundaries, and never
|
||||
* loses crucial state information while downsampling within a contig.
|
||||
*
|
||||
* TODO: There is also at least one important disadvantage:
|
||||
*
|
||||
* 1. We load more BAM index data into memory at once, and this work is done upfront before processing
|
||||
* the next contig, creating a delay before traversal of each contig. This delay may be
|
||||
* compensated for by the gains listed in #1 above, and we may be no worse off overall in
|
||||
* terms of total runtime, but we need to verify this empirically.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class ExperimentalReadShardBalancer extends ShardBalancer {
|
||||
|
||||
private static Logger logger = Logger.getLogger(ExperimentalReadShardBalancer.class);
|
||||
|
||||
/**
|
||||
* Convert iterators of file pointers into balanced iterators of shards.
|
||||
* @return An iterator over balanced shards.
|
||||
*/
|
||||
public Iterator<Shard> iterator() {
|
||||
return new Iterator<Shard>() {
|
||||
/**
|
||||
* The cached shard to be returned next. Prefetched in the peekable iterator style.
|
||||
*/
|
||||
private Shard nextShard = null;
|
||||
|
||||
/**
|
||||
* The file pointer currently being processed.
|
||||
*/
|
||||
private FilePointer currentContigFilePointer = null;
|
||||
|
||||
/**
|
||||
* Iterator over the reads from the current contig's file pointer. The same iterator will be
|
||||
* used to fill all shards associated with a given file pointer
|
||||
*/
|
||||
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
|
||||
|
||||
/**
|
||||
* How many FilePointers have we pulled from the filePointers iterator?
|
||||
*/
|
||||
private int totalFilePointersConsumed = 0;
|
||||
|
||||
/**
|
||||
* Have we encountered a monolithic FilePointer?
|
||||
*/
|
||||
private boolean encounteredMonolithicFilePointer = false;
|
||||
|
||||
|
||||
{
|
||||
createNextContigFilePointer();
|
||||
advance();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextShard != null;
|
||||
}
|
||||
|
||||
public Shard next() {
|
||||
if ( ! hasNext() )
|
||||
throw new NoSuchElementException("No next read shard available");
|
||||
Shard currentShard = nextShard;
|
||||
advance();
|
||||
return currentShard;
|
||||
}
|
||||
|
||||
private void advance() {
|
||||
nextShard = null;
|
||||
|
||||
// May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away
|
||||
while ( nextShard == null && currentContigFilePointer != null ) {
|
||||
|
||||
// If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one):
|
||||
if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) {
|
||||
|
||||
// Close the old, exhausted chain of iterators to release resources
|
||||
currentContigReadsIterator.close();
|
||||
|
||||
// Advance to the FilePointer for the next contig
|
||||
createNextContigFilePointer();
|
||||
|
||||
// We'll need to create a fresh iterator for this file pointer when we create the first
|
||||
// shard for it below.
|
||||
currentContigReadsIterator = null;
|
||||
}
|
||||
|
||||
// At this point our currentContigReadsIterator may be null or non-null depending on whether or not
|
||||
// this is our first shard for this file pointer.
|
||||
if ( currentContigFilePointer != null ) {
|
||||
Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped);
|
||||
|
||||
// Create a new reads iterator only when we've just advanced to the file pointer for the next
|
||||
// contig. It's essential that the iterators persist across all shards that share the same contig
|
||||
// to allow the downsampling to work properly.
|
||||
if ( currentContigReadsIterator == null ) {
|
||||
currentContigReadsIterator = new PeekableIterator<SAMRecord>(readsDataSource.getIterator(shard));
|
||||
}
|
||||
|
||||
if ( currentContigReadsIterator.hasNext() ) {
|
||||
shard.fill(currentContigReadsIterator);
|
||||
nextShard = shard;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Aggregate all FilePointers for the next contig together into one monolithic FilePointer
|
||||
* to avoid boundary issues with visiting the same file regions more than once (since more
|
||||
* granular FilePointers will have regions that overlap with other nearby FilePointers due
|
||||
* to the nature of BAM indices).
|
||||
*
|
||||
* By creating one persistent set of iterators per contig we also avoid boundary artifacts
|
||||
* in the engine-level downsampling.
|
||||
*
|
||||
* TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for
|
||||
* TODO: read traversals, as there's little point in the BAMSchedule emitting extremely
|
||||
* TODO: granular FilePointers if we're just going to union them. The BAMSchedule should
|
||||
* TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for
|
||||
* TODO: locus traversals).
|
||||
*/
|
||||
private void createNextContigFilePointer() {
|
||||
currentContigFilePointer = null;
|
||||
List<FilePointer> nextContigFilePointers = new ArrayList<FilePointer>();
|
||||
|
||||
logger.info("Loading BAM index data for next contig");
|
||||
|
||||
while ( filePointers.hasNext() ) {
|
||||
|
||||
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
|
||||
// it is the ONLY FilePointer we ever encounter
|
||||
if ( encounteredMonolithicFilePointer ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
|
||||
}
|
||||
if ( filePointers.peek().isMonolithic() ) {
|
||||
if ( totalFilePointersConsumed > 0 ) {
|
||||
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
|
||||
}
|
||||
encounteredMonolithicFilePointer = true;
|
||||
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
|
||||
}
|
||||
|
||||
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
|
||||
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
|
||||
if ( nextContigFilePointers.isEmpty() ||
|
||||
(! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped &&
|
||||
nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) ||
|
||||
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
|
||||
|
||||
nextContigFilePointers.add(filePointers.next());
|
||||
totalFilePointersConsumed++;
|
||||
}
|
||||
else {
|
||||
break; // next FilePointer is on a different contig or has different mapped/unmapped status,
|
||||
// save it for next time
|
||||
}
|
||||
}
|
||||
|
||||
if ( ! nextContigFilePointers.isEmpty() ) {
|
||||
currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser);
|
||||
}
|
||||
|
||||
if ( currentContigFilePointer != null ) {
|
||||
logger.info("Done loading BAM index data for next contig");
|
||||
logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer));
|
||||
}
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
|
@ -48,18 +50,87 @@ public class FilePointer {
|
|||
*/
|
||||
protected final boolean isRegionUnmapped;
|
||||
|
||||
public FilePointer(final GenomeLoc... locations) {
|
||||
this.locations.addAll(Arrays.asList(locations));
|
||||
/**
|
||||
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
|
||||
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
|
||||
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
|
||||
* from more than one contig.
|
||||
*/
|
||||
private boolean isMonolithic = false;
|
||||
|
||||
/**
|
||||
* Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers
|
||||
*/
|
||||
private Integer contigIndex = null;
|
||||
|
||||
|
||||
public FilePointer( List<GenomeLoc> locations ) {
|
||||
this.locations.addAll(locations);
|
||||
this.isRegionUnmapped = checkUnmappedStatus();
|
||||
|
||||
validateAllLocations();
|
||||
if ( locations.size() > 0 ) {
|
||||
contigIndex = locations.get(0).getContigIndex();
|
||||
}
|
||||
}
|
||||
|
||||
public FilePointer( final GenomeLoc... locations ) {
|
||||
this(Arrays.asList(locations));
|
||||
}
|
||||
|
||||
public FilePointer( Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> locations ) {
|
||||
this(locations);
|
||||
this.fileSpans.putAll(fileSpans);
|
||||
}
|
||||
|
||||
private boolean checkUnmappedStatus() {
|
||||
boolean foundMapped = false, foundUnmapped = false;
|
||||
for(GenomeLoc location: locations) {
|
||||
if(GenomeLoc.isUnmapped(location))
|
||||
|
||||
for( GenomeLoc location: locations ) {
|
||||
if ( GenomeLoc.isUnmapped(location) )
|
||||
foundUnmapped = true;
|
||||
else
|
||||
foundMapped = true;
|
||||
}
|
||||
if(foundMapped && foundUnmapped)
|
||||
if ( foundMapped && foundUnmapped )
|
||||
throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
|
||||
this.isRegionUnmapped = foundUnmapped;
|
||||
|
||||
return foundUnmapped;
|
||||
}
|
||||
|
||||
private void validateAllLocations() {
|
||||
// Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction
|
||||
if ( isRegionUnmapped || isMonolithic ) {
|
||||
return;
|
||||
}
|
||||
|
||||
Integer previousContigIndex = null;
|
||||
|
||||
for ( GenomeLoc location : locations ) {
|
||||
if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) {
|
||||
throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
|
||||
}
|
||||
|
||||
previousContigIndex = location.getContigIndex();
|
||||
}
|
||||
}
|
||||
|
||||
private void validateLocation( GenomeLoc location ) {
|
||||
if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) {
|
||||
throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
|
||||
}
|
||||
if ( ! isRegionUnmapped && ! isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) {
|
||||
throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an immutable view of this FilePointer's file spans
|
||||
*
|
||||
* @return an immutable view of this FilePointer's file spans
|
||||
*/
|
||||
public Map<SAMReaderID, SAMFileSpan> getFileSpans() {
|
||||
return Collections.unmodifiableMap(fileSpans);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -70,6 +141,39 @@ public class FilePointer {
|
|||
return Collections.unmodifiableList(locations);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the index of the contig into which this FilePointer points (a FilePointer can represent
|
||||
* regions in at most one contig).
|
||||
*
|
||||
* @return the index of the contig into which this FilePointer points
|
||||
*/
|
||||
public int getContigIndex() {
|
||||
return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
|
||||
}
|
||||
|
||||
/**
|
||||
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
|
||||
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
|
||||
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
|
||||
* from more than one contig.
|
||||
*
|
||||
* @return true if this FP is a monolithic FP representing all regions in all files, otherwise false
|
||||
*/
|
||||
public boolean isMonolithic() {
|
||||
return isMonolithic;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all
|
||||
* regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic
|
||||
* FP may contain intervals from more than one contig.
|
||||
*
|
||||
* @param isMonolithic set this FP's monolithic status to this value
|
||||
*/
|
||||
public void setIsMonolithic( boolean isMonolithic ) {
|
||||
this.isMonolithic = isMonolithic;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(final Object other) {
|
||||
if(!(other instanceof FilePointer))
|
||||
|
|
@ -98,7 +202,12 @@ public class FilePointer {
|
|||
}
|
||||
|
||||
public void addLocation(final GenomeLoc location) {
|
||||
locations.add(location);
|
||||
validateLocation(location);
|
||||
|
||||
this.locations.add(location);
|
||||
if ( contigIndex == null ) {
|
||||
contigIndex = location.getContigIndex();
|
||||
}
|
||||
}
|
||||
|
||||
public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) {
|
||||
|
|
@ -216,6 +325,84 @@ public class FilePointer {
|
|||
combined.addFileSpans(initialElement.getKey(),fileSpan);
|
||||
}
|
||||
|
||||
/**
|
||||
* Efficiently generate the union of the n FilePointers passed in. Much more efficient than
|
||||
* combining two FilePointers at a time using the combine() method above.
|
||||
*
|
||||
* IMPORTANT: the FilePointers to be unioned must either all represent regions on the
|
||||
* same contig, or all be unmapped, since we cannot create FilePointers with a mix of
|
||||
* contigs or with mixed mapped/unmapped regions.
|
||||
*
|
||||
* @param filePointers the FilePointers to union
|
||||
* @param parser our GenomeLocParser
|
||||
* @return the union of the FilePointers passed in
|
||||
*/
|
||||
public static FilePointer union( List<FilePointer> filePointers, GenomeLocParser parser ) {
|
||||
if ( filePointers == null || filePointers.isEmpty() ) {
|
||||
return new FilePointer();
|
||||
}
|
||||
|
||||
Map<SAMReaderID, List<GATKChunk>> fileChunks = new HashMap<SAMReaderID, List<GATKChunk>>();
|
||||
List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
|
||||
|
||||
// First extract all intervals and file chunks from the FilePointers into unsorted, unmerged collections
|
||||
for ( FilePointer filePointer : filePointers ) {
|
||||
locations.addAll(filePointer.getLocations());
|
||||
|
||||
for ( Map.Entry<SAMReaderID, SAMFileSpan> fileSpanEntry : filePointer.getFileSpans().entrySet() ) {
|
||||
GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)fileSpanEntry.getValue();
|
||||
|
||||
if ( fileChunks.containsKey(fileSpanEntry.getKey()) ) {
|
||||
fileChunks.get(fileSpanEntry.getKey()).addAll(fileSpan.getGATKChunks());
|
||||
}
|
||||
else {
|
||||
fileChunks.put(fileSpanEntry.getKey(), fileSpan.getGATKChunks());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Now sort and merge the intervals
|
||||
List<GenomeLoc> sortedMergedLocations = new ArrayList<GenomeLoc>();
|
||||
sortedMergedLocations.addAll(IntervalUtils.sortAndMergeIntervals(parser, locations, IntervalMergingRule.ALL));
|
||||
|
||||
// For each BAM file, convert from an unsorted, unmerged list of chunks to a GATKBAMFileSpan containing
|
||||
// the sorted, merged union of the chunks for that file
|
||||
Map<SAMReaderID, SAMFileSpan> mergedFileSpans = new HashMap<SAMReaderID, SAMFileSpan>(fileChunks.size());
|
||||
for ( Map.Entry<SAMReaderID, List<GATKChunk>> fileChunksEntry : fileChunks.entrySet() ) {
|
||||
List<GATKChunk> unmergedChunks = fileChunksEntry.getValue();
|
||||
mergedFileSpans.put(fileChunksEntry.getKey(),
|
||||
(new GATKBAMFileSpan(unmergedChunks.toArray(new GATKChunk[unmergedChunks.size()]))).union(new GATKBAMFileSpan()));
|
||||
}
|
||||
|
||||
return new FilePointer(mergedFileSpans, sortedMergedLocations);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if any of the file spans in this FilePointer overlap their counterparts in
|
||||
* the other FilePointer. "Overlap" is defined as having an overlapping extent (the region
|
||||
* from the start of the first chunk to the end of the last chunk).
|
||||
*
|
||||
* @param other the FilePointer against which to check overlap with this FilePointer
|
||||
* @return true if any file spans overlap their counterparts in other, otherwise false
|
||||
*/
|
||||
public boolean hasFileSpansOverlappingWith( FilePointer other ) {
|
||||
for ( Map.Entry<SAMReaderID, SAMFileSpan> thisFilePointerEntry : fileSpans.entrySet() ) {
|
||||
GATKBAMFileSpan thisFileSpan = new GATKBAMFileSpan(thisFilePointerEntry.getValue());
|
||||
|
||||
SAMFileSpan otherEntry = other.fileSpans.get(thisFilePointerEntry.getKey());
|
||||
if ( otherEntry == null ) {
|
||||
continue; // no counterpart for this file span in other
|
||||
}
|
||||
GATKBAMFileSpan otherFileSpan = new GATKBAMFileSpan(otherEntry);
|
||||
|
||||
if ( thisFileSpan.getExtent().overlaps(otherFileSpan.getExtent()) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
|
|
|
|||
|
|
@ -73,8 +73,15 @@ public class IntervalSharder implements Iterator<FilePointer> {
|
|||
*/
|
||||
public FilePointer next() {
|
||||
FilePointer current = wrappedIterator.next();
|
||||
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
|
||||
|
||||
while ( wrappedIterator.hasNext() &&
|
||||
current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped &&
|
||||
(current.getContigIndex() == wrappedIterator.peek().getContigIndex() || current.isRegionUnmapped) &&
|
||||
current.minus(wrappedIterator.peek()) == 0 ) {
|
||||
|
||||
current = current.combine(parser,wrappedIterator.next());
|
||||
}
|
||||
|
||||
return current;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -42,8 +42,10 @@ public class LocusShardBalancer extends ShardBalancer {
|
|||
|
||||
public Shard next() {
|
||||
FilePointer current = filePointers.next();
|
||||
while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0)
|
||||
current = current.combine(parser,filePointers.next());
|
||||
|
||||
// FilePointers have already been combined as necessary at the IntervalSharder level. No
|
||||
// need to do so again here.
|
||||
|
||||
return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,16 +1,15 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
*
|
||||
|
|
@ -35,10 +34,21 @@ import java.util.Map;
|
|||
* @version 0.1
|
||||
*/
|
||||
public class ReadShard extends Shard {
|
||||
|
||||
/**
|
||||
* Default read shard buffer size
|
||||
*/
|
||||
public static final int DEFAULT_MAX_READS = 10000;
|
||||
|
||||
/**
|
||||
* What is the maximum number of reads per BAM file which should go into a read shard.
|
||||
*
|
||||
* TODO: this non-final static variable should either be made final or turned into an
|
||||
* TODO: instance variable somewhere -- as both static and mutable it wreaks havoc
|
||||
* TODO: with tests that use multiple instances of SAMDataSource (since SAMDataSource
|
||||
* TODO: changes this value)
|
||||
*/
|
||||
public static int MAX_READS = 10000;
|
||||
public static int MAX_READS = DEFAULT_MAX_READS;
|
||||
|
||||
/**
|
||||
* The reads making up this shard.
|
||||
|
|
@ -52,12 +62,24 @@ public class ReadShard extends Shard {
|
|||
/**
|
||||
* Sets the maximum number of reads buffered in a read shard. Implemented as a weirdly static interface
|
||||
* until we know what effect tuning this parameter has.
|
||||
*
|
||||
* TODO: this mutable static interface is awful and breaks tests -- need to refactor
|
||||
*
|
||||
* @param bufferSize New maximum number
|
||||
*/
|
||||
static void setReadBufferSize(final int bufferSize) {
|
||||
MAX_READS = bufferSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* What read buffer size are we using?
|
||||
*
|
||||
* @return the maximum number of reads currently buffered in a read shard (the value of MAX_READS)
|
||||
*/
|
||||
public static int getReadBufferSize() {
|
||||
return MAX_READS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this shard is meant to buffer reads, rather
|
||||
* than just holding pointers to their locations.
|
||||
|
|
@ -93,6 +115,67 @@ public class ReadShard extends Shard {
|
|||
reads.add(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fills this shard's buffer with reads from the iterator passed in
|
||||
*
|
||||
* @param readIter Iterator from which to draw the reads to fill the shard
|
||||
*/
|
||||
@Override
|
||||
public void fill( PeekableIterator<SAMRecord> readIter ) {
|
||||
if( ! buffersReads() )
|
||||
throw new ReviewedStingException("Attempting to fill a non-buffering shard.");
|
||||
|
||||
SAMFileHeader.SortOrder sortOrder = getReadProperties().getSortOrder();
|
||||
SAMRecord read = null;
|
||||
|
||||
while( ! isBufferFull() && readIter.hasNext() ) {
|
||||
final SAMRecord nextRead = readIter.peek();
|
||||
if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) {
|
||||
// only add reads to the shard if they are on the same contig
|
||||
read = readIter.next();
|
||||
addRead(read);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If the reads are sorted in coordinate order, ensure that all reads
|
||||
// having the same alignment start become part of the same shard, to allow
|
||||
// downsampling to work better across shard boundaries. Note that because our
|
||||
// read stream has already been fed through the positional downsampler, which
|
||||
// ensures that at each alignment start position there are no more than dcov
|
||||
// reads, we're in no danger of accidentally creating a disproportionately huge
|
||||
// shard
|
||||
if ( sortOrder == SAMFileHeader.SortOrder.coordinate ) {
|
||||
while ( readIter.hasNext() ) {
|
||||
SAMRecord additionalRead = readIter.peek();
|
||||
|
||||
// Stop filling the shard as soon as we encounter a read having a different
|
||||
// alignment start or contig from the last read added in the earlier loop
|
||||
// above, or an unmapped read
|
||||
if ( read == null ||
|
||||
additionalRead.getReadUnmappedFlag() ||
|
||||
! additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) ||
|
||||
additionalRead.getAlignmentStart() != read.getAlignmentStart() ) {
|
||||
break;
|
||||
}
|
||||
|
||||
addRead(readIter.next());
|
||||
}
|
||||
}
|
||||
|
||||
// If the reads are sorted in queryname order, ensure that all reads
|
||||
// having the same queryname become part of the same shard.
|
||||
if( sortOrder == SAMFileHeader.SortOrder.queryname ) {
|
||||
while( readIter.hasNext() ) {
|
||||
SAMRecord nextRead = readIter.peek();
|
||||
if( read == null || ! read.getReadName().equals(nextRead.getReadName()) )
|
||||
break;
|
||||
addRead(readIter.next());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an iterator over reads stored in this shard's read cache.
|
||||
* @return
|
||||
|
|
@ -116,4 +199,48 @@ public class ReadShard extends Shard {
|
|||
}
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the full span from the start of the left most read to the end of the right most one
|
||||
*
|
||||
* Note this may be different than the getLocation() of the shard, as this reflects the
|
||||
* targeted span, not the actual span of reads
|
||||
*
|
||||
* @return the genome loc representing the span of these reads on the genome
|
||||
*/
|
||||
public GenomeLoc getReadsSpan() {
|
||||
if ( isUnmapped() || super.getGenomeLocs() == null || reads.isEmpty() )
|
||||
return super.getLocation();
|
||||
else {
|
||||
int start = Integer.MAX_VALUE;
|
||||
int stop = Integer.MIN_VALUE;
|
||||
String contig = null;
|
||||
boolean foundMapped = false;
|
||||
|
||||
for ( final SAMRecord read : reads ) {
|
||||
if ( contig != null && ! read.getReferenceName().equals(contig) )
|
||||
throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. "
|
||||
+ "First contig is " + contig + " next read was " + read.getReferenceName() );
|
||||
contig = read.getReferenceName();
|
||||
|
||||
// Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates
|
||||
// of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries,
|
||||
// this shard might consist *only* of unmapped mates! We need to refrain from using the alignment
|
||||
// starts/stops of these unmapped mates, and detect the case where the shard has been filled *only*
|
||||
// with unmapped mates.
|
||||
if ( ! read.getReadUnmappedFlag() ) {
|
||||
foundMapped = true;
|
||||
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
|
||||
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
|
||||
}
|
||||
}
|
||||
|
||||
assert contig != null;
|
||||
|
||||
if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped
|
||||
return GenomeLoc.UNMAPPED;
|
||||
else
|
||||
return parser.createGenomeLoc(contig, start, stop);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@ import java.util.NoSuchElementException;
|
|||
|
||||
/**
|
||||
* Divide up large file pointers containing reads into more manageable subcomponents.
|
||||
*
|
||||
* TODO: delete this class once the experimental downsampling engine fork collapses
|
||||
*/
|
||||
public class ReadShardBalancer extends ShardBalancer {
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -24,14 +24,15 @@
|
|||
|
||||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import net.sf.picard.sam.MergingSamRecordIterator;
|
||||
import net.sf.picard.sam.SamFileHeaderMerger;
|
||||
import net.sf.samtools.*;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import net.sf.samtools.util.RuntimeIOException;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.downsampling.*;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
|
||||
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.ReadMetrics;
|
||||
import org.broadinstitute.sting.gatk.ReadProperties;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
|
|
@ -42,12 +43,9 @@ import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
|
|||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.SimpleTimer;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
|
||||
import org.broadinstitute.sting.utils.baq.ReadTransformingIterator;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.recalibration.BQSRSamIterator;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -101,6 +99,8 @@ public class SAMDataSource {
|
|||
|
||||
/**
|
||||
* How far along is each reader?
|
||||
*
|
||||
* TODO: delete this once the experimental downsampling engine fork collapses
|
||||
*/
|
||||
private final Map<SAMReaderID,GATKBAMFileSpan> readerPositions = new HashMap<SAMReaderID,GATKBAMFileSpan>();
|
||||
|
||||
|
|
@ -200,11 +200,8 @@ public class SAMDataSource {
|
|||
downsamplingMethod,
|
||||
exclusionList,
|
||||
supplementalFilters,
|
||||
Collections.<ReadTransformer>emptyList(),
|
||||
includeReadsWithDeletionAtLoci,
|
||||
BAQ.CalculationMode.OFF,
|
||||
BAQ.QualityMode.DONT_MODIFY,
|
||||
null, // no BAQ
|
||||
null, // no BQSR
|
||||
(byte) -1,
|
||||
false);
|
||||
}
|
||||
|
|
@ -234,11 +231,8 @@ public class SAMDataSource {
|
|||
DownsamplingMethod downsamplingMethod,
|
||||
ValidationExclusion exclusionList,
|
||||
Collection<ReadFilter> supplementalFilters,
|
||||
List<ReadTransformer> readTransformers,
|
||||
boolean includeReadsWithDeletionAtLoci,
|
||||
BAQ.CalculationMode cmode,
|
||||
BAQ.QualityMode qmode,
|
||||
IndexedFastaSequenceFile refReader,
|
||||
BaseRecalibration bqsrApplier,
|
||||
byte defaultBaseQualities,
|
||||
boolean removeProgramRecords) {
|
||||
this.readMetrics = new ReadMetrics();
|
||||
|
|
@ -258,11 +252,11 @@ public class SAMDataSource {
|
|||
validationStringency = strictness;
|
||||
this.removeProgramRecords = removeProgramRecords;
|
||||
if(readBufferSize != null)
|
||||
ReadShard.setReadBufferSize(readBufferSize);
|
||||
ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests
|
||||
else {
|
||||
// Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively
|
||||
// will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once.
|
||||
ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000));
|
||||
ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000));
|
||||
}
|
||||
|
||||
resourcePool = new SAMResourcePool(Integer.MAX_VALUE);
|
||||
|
|
@ -303,16 +297,14 @@ public class SAMDataSource {
|
|||
readProperties = new ReadProperties(
|
||||
samFiles,
|
||||
mergedHeader,
|
||||
sortOrder,
|
||||
useOriginalBaseQualities,
|
||||
strictness,
|
||||
downsamplingMethod,
|
||||
exclusionList,
|
||||
supplementalFilters,
|
||||
readTransformers,
|
||||
includeReadsWithDeletionAtLoci,
|
||||
cmode,
|
||||
qmode,
|
||||
refReader,
|
||||
bqsrApplier,
|
||||
defaultBaseQualities);
|
||||
|
||||
// cache the read group id (original) -> read group id (merged)
|
||||
|
|
@ -388,7 +380,10 @@ public class SAMDataSource {
|
|||
/**
|
||||
* Retrieves the current position within the BAM file.
|
||||
* @return A mapping of reader to current position.
|
||||
*
|
||||
* TODO: delete this once the experimental downsampling engine fork collapses
|
||||
*/
|
||||
@Deprecated
|
||||
public Map<SAMReaderID,GATKBAMFileSpan> getCurrentPosition() {
|
||||
return readerPositions;
|
||||
}
|
||||
|
|
@ -471,9 +466,15 @@ public class SAMDataSource {
|
|||
}
|
||||
|
||||
/**
|
||||
* Fill the given buffering shard with reads.
|
||||
* Legacy method to fill the given buffering shard with reads.
|
||||
*
|
||||
* Shard.fill() is used instead of this method when experimental downsampling is enabled
|
||||
*
|
||||
* TODO: delete this method once the experimental downsampling engine fork collapses
|
||||
*
|
||||
* @param shard Shard to fill.
|
||||
*/
|
||||
@Deprecated
|
||||
public void fillShard(Shard shard) {
|
||||
if(!shard.buffersReads())
|
||||
throw new ReviewedStingException("Attempting to fill a non-buffering shard.");
|
||||
|
|
@ -486,9 +487,15 @@ public class SAMDataSource {
|
|||
|
||||
CloseableIterator<SAMRecord> iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate);
|
||||
while(!shard.isBufferFull() && iterator.hasNext()) {
|
||||
read = iterator.next();
|
||||
shard.addRead(read);
|
||||
noteFilePositionUpdate(positionUpdates,read);
|
||||
final SAMRecord nextRead = iterator.next();
|
||||
if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) {
|
||||
// only add reads to the shard if they are on the same contig
|
||||
read = nextRead;
|
||||
shard.addRead(read);
|
||||
noteFilePositionUpdate(positionUpdates,read);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If the reads are sorted in queryname order, ensure that all reads
|
||||
|
|
@ -510,6 +517,10 @@ public class SAMDataSource {
|
|||
readerPositions.put(readers.getReaderID(positionUpdate.getKey()),positionUpdate.getValue());
|
||||
}
|
||||
|
||||
/*
|
||||
* TODO: delete this method once the experimental downsampling engine fork collapses
|
||||
*/
|
||||
@Deprecated
|
||||
private void noteFilePositionUpdate(Map<SAMFileReader,GATKBAMFileSpan> positionMapping, SAMRecord read) {
|
||||
GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing());
|
||||
positionMapping.put(read.getFileSource().getReader(),endChunk);
|
||||
|
|
@ -520,8 +531,7 @@ public class SAMDataSource {
|
|||
return shard.iterator();
|
||||
}
|
||||
else {
|
||||
SAMReaders readers = resourcePool.getAvailableReaders();
|
||||
return getIterator(readers,shard,shard instanceof ReadShard);
|
||||
return getIterator(shard);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -541,13 +551,44 @@ public class SAMDataSource {
|
|||
|
||||
/**
|
||||
* Initialize the current reader positions
|
||||
*
|
||||
* TODO: delete this once the experimental downsampling engine fork collapses
|
||||
*
|
||||
* @param readers
|
||||
*/
|
||||
@Deprecated
|
||||
private void initializeReaderPositions(SAMReaders readers) {
|
||||
for(SAMReaderID id: getReaderIDs())
|
||||
readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the initial reader positions across all BAM files
|
||||
*
|
||||
* @return the start positions of the first chunk of reads for all BAM files
|
||||
*/
|
||||
public Map<SAMReaderID, GATKBAMFileSpan> getInitialReaderPositions() {
|
||||
Map<SAMReaderID, GATKBAMFileSpan> initialPositions = new HashMap<SAMReaderID, GATKBAMFileSpan>();
|
||||
SAMReaders readers = resourcePool.getAvailableReaders();
|
||||
|
||||
for ( SAMReaderID id: getReaderIDs() ) {
|
||||
initialPositions.put(id, new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
|
||||
}
|
||||
|
||||
resourcePool.releaseReaders(readers);
|
||||
return initialPositions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get an iterator over the data types specified in the shard.
|
||||
*
|
||||
* @param shard The shard specifying the data limits.
|
||||
* @return An iterator over the selected data.
|
||||
*/
|
||||
public StingSAMIterator getIterator( Shard shard ) {
|
||||
return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get an iterator over the data types specified in the shard.
|
||||
* @param readers Readers from which to load data.
|
||||
|
|
@ -585,6 +626,7 @@ public class SAMDataSource {
|
|||
iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator);
|
||||
if(shard.getGenomeLocs().size() > 0)
|
||||
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
|
||||
|
||||
iteratorMap.put(readers.getReader(id), iterator);
|
||||
}
|
||||
|
||||
|
|
@ -597,10 +639,7 @@ public class SAMDataSource {
|
|||
readProperties.getDownsamplingMethod().toFraction,
|
||||
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
|
||||
readProperties.getSupplementalFilters(),
|
||||
readProperties.getBAQCalculationMode(),
|
||||
readProperties.getBAQQualityMode(),
|
||||
readProperties.getRefReader(),
|
||||
readProperties.getBQSRApplier(),
|
||||
readProperties.getReadTransformers(),
|
||||
readProperties.defaultBaseQualities());
|
||||
}
|
||||
|
||||
|
|
@ -667,40 +706,62 @@ public class SAMDataSource {
|
|||
Double downsamplingFraction,
|
||||
Boolean noValidationOfReadOrder,
|
||||
Collection<ReadFilter> supplementalFilters,
|
||||
BAQ.CalculationMode cmode,
|
||||
BAQ.QualityMode qmode,
|
||||
IndexedFastaSequenceFile refReader,
|
||||
BaseRecalibration bqsrApplier,
|
||||
List<ReadTransformer> readTransformers,
|
||||
byte defaultBaseQualities) {
|
||||
|
||||
// *********************************************************************************** //
|
||||
// * NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
|
||||
// * (otherwise we will process something that we may end up throwing away) * //
|
||||
// *********************************************************************************** //
|
||||
// ************************************************************************************************ //
|
||||
// * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
|
||||
// * (otherwise we will process something that we may end up throwing away) * //
|
||||
// ************************************************************************************************ //
|
||||
|
||||
if (downsamplingFraction != null)
|
||||
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
|
||||
|
||||
if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) {
|
||||
wrappedIterator = applyDownsamplingIterator(wrappedIterator);
|
||||
}
|
||||
|
||||
// Use the old fractional downsampler only if we're not using experimental downsampling:
|
||||
if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null )
|
||||
wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction);
|
||||
|
||||
// unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification,
|
||||
// verify the read ordering by applying a sort order iterator
|
||||
if (!noValidationOfReadOrder && enableVerification)
|
||||
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);
|
||||
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
|
||||
wrappedIterator = new VerifyingSamIterator(wrappedIterator);
|
||||
|
||||
if (useOriginalBaseQualities || defaultBaseQualities >= 0)
|
||||
// only wrap if we are replacing the original qualities or using a default base quality
|
||||
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
|
||||
|
||||
if (bqsrApplier != null)
|
||||
wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier);
|
||||
|
||||
if (cmode != BAQ.CalculationMode.OFF)
|
||||
wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode);
|
||||
// set up read transformers
|
||||
for ( final ReadTransformer readTransformer : readTransformers ) {
|
||||
if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT )
|
||||
wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer);
|
||||
}
|
||||
|
||||
return wrappedIterator;
|
||||
}
|
||||
|
||||
protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) {
|
||||
if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) {
|
||||
ReadsDownsamplerFactory<SAMRecord> downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ?
|
||||
new SimplePositionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
|
||||
new FractionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
|
||||
|
||||
return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory);
|
||||
}
|
||||
else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) {
|
||||
ReadsDownsampler<SAMRecord> downsampler = readProperties.getDownsamplingMethod().toCoverage != null ?
|
||||
new SimplePositionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
|
||||
new FractionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
|
||||
|
||||
return new DownsamplingReadsIterator(wrappedIterator, downsampler);
|
||||
}
|
||||
|
||||
return wrappedIterator;
|
||||
}
|
||||
|
||||
|
||||
private class SAMResourcePool {
|
||||
/**
|
||||
* How many entries can be cached in this resource pool?
|
||||
|
|
@ -947,6 +1008,12 @@ public class SAMDataSource {
|
|||
} catch ( SAMFormatException e ) {
|
||||
throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
|
||||
}
|
||||
// Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files).
|
||||
// Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case,
|
||||
// just in case we want to change this behavior later.
|
||||
catch ( RuntimeException e ) {
|
||||
throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
|
||||
}
|
||||
reader.setSAMRecordFactory(factory);
|
||||
reader.enableFileSource(true);
|
||||
reader.setValidationStringency(validationStringency);
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.ReadMetrics;
|
||||
|
|
@ -203,6 +204,12 @@ public abstract class Shard implements HasGenomeLocation {
|
|||
*/
|
||||
public void addRead(SAMRecord read) { throw new UnsupportedOperationException("This shard does not buffer reads."); }
|
||||
|
||||
/**
|
||||
* Fills the shard with reads. Can only do this with shards that buffer reads
|
||||
* @param readIter Iterator from which to draw the reads to fill the shard
|
||||
*/
|
||||
public void fill( PeekableIterator<SAMRecord> readIter ) { throw new UnsupportedOperationException("This shard does not buffer reads."); }
|
||||
|
||||
/**
|
||||
* Gets the iterator over the elements cached in the shard.
|
||||
* @return
|
||||
|
|
|
|||
|
|
@ -34,8 +34,10 @@ import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.Type;
|
||||
import java.util.List;
|
||||
|
|
@ -239,6 +241,8 @@ class ReferenceOrderedQueryDataPool extends ResourcePool<RMDTrack,LocationAwareS
|
|||
} else {
|
||||
return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.getIterator());
|
||||
}
|
||||
} catch (FileNotFoundException e) {
|
||||
throw new UserException.CouldNotReadInputFile(fileDescriptor.getName(), "it could not be found");
|
||||
} catch (IOException e) {
|
||||
throw new ReviewedStingException("Unable to create iterator for rod named " + fileDescriptor.getName(),e);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.gatk;
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
/**
|
||||
* Type of downsampling method to invoke.
|
||||
|
|
@ -28,49 +28,92 @@ import java.util.Collection;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The basic downsampler API, with no reads-specific operations
|
||||
* The basic downsampler API, with no reads-specific operations.
|
||||
*
|
||||
* Downsamplers that extend this interface rather than the ReadsDownsampler interface can handle
|
||||
* any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a
|
||||
* PerSampleDownsamplingReadsIterator.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public interface Downsampler<T> {
|
||||
|
||||
/*
|
||||
* Submit one item to the downsampler for consideration . Some downsamplers will be able to determine
|
||||
/**
|
||||
* Submit one item to the downsampler for consideration. Some downsamplers will be able to determine
|
||||
* immediately whether the item survives the downsampling process, while others will need to see
|
||||
* more items before making that determination.
|
||||
*
|
||||
* @param item the individual item to submit to the downsampler for consideration
|
||||
*/
|
||||
public void submit( T item );
|
||||
|
||||
/*
|
||||
* Submit a collection of items to the downsampler for consideration.
|
||||
/**
|
||||
* Submit a collection of items to the downsampler for consideration. Should be equivalent to calling
|
||||
* submit() on each individual item in the collection.
|
||||
*
|
||||
* @param items the collection of items to submit to the downsampler for consideration
|
||||
*/
|
||||
public void submit( Collection<T> items );
|
||||
|
||||
/*
|
||||
/**
|
||||
* Are there items that have survived the downsampling process waiting to be retrieved?
|
||||
*
|
||||
* @return true if this downsampler has > 0 finalized items, otherwise false
|
||||
*/
|
||||
public boolean hasDownsampledItems();
|
||||
public boolean hasFinalizedItems();
|
||||
|
||||
/*
|
||||
* Return (and remove) all items that have survived downsampling and are waiting to be retrieved.
|
||||
/**
|
||||
* Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved.
|
||||
*
|
||||
* @return a list of all finalized items this downsampler contains, or an empty list if there are none
|
||||
*/
|
||||
public List<T> consumeDownsampledItems();
|
||||
public List<T> consumeFinalizedItems();
|
||||
|
||||
/*
|
||||
/**
|
||||
* Are there items stored in this downsampler that it doesn't yet know whether they will
|
||||
* ultimately survive the downsampling process?
|
||||
*
|
||||
* @return true if this downsampler has > 0 pending items, otherwise false
|
||||
*/
|
||||
public boolean hasPendingItems();
|
||||
|
||||
/*
|
||||
/**
|
||||
* Peek at the first finalized item stored in this downsampler (or null if there are no finalized items)
|
||||
*
|
||||
* @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call),
|
||||
* or null if there are none
|
||||
*/
|
||||
public T peekFinalized();
|
||||
|
||||
/**
|
||||
* Peek at the first pending item stored in this downsampler (or null if there are no pending items)
|
||||
*
|
||||
* @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call),
|
||||
* or null if there are none
|
||||
*/
|
||||
public T peekPending();
|
||||
|
||||
/**
|
||||
* Returns the number of items discarded (so far) during the downsampling process
|
||||
*
|
||||
* @return the number of items that have been submitted to this downsampler and discarded in the process of
|
||||
* downsampling
|
||||
*/
|
||||
public int getNumberOfDiscardedItems();
|
||||
|
||||
/**
|
||||
* Used to tell the downsampler that no more items will be submitted to it, and that it should
|
||||
* finalize any pending items.
|
||||
*/
|
||||
public void signalEndOfInput();
|
||||
|
||||
/*
|
||||
* Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state
|
||||
* information.
|
||||
/**
|
||||
* Empty the downsampler of all finalized/pending items
|
||||
*/
|
||||
public void clear();
|
||||
|
||||
/**
|
||||
* Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items
|
||||
*/
|
||||
public void reset();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,153 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
/**
|
||||
* Describes the method for downsampling reads at a given locus.
|
||||
*/
|
||||
|
||||
public class DownsamplingMethod {
|
||||
/**
|
||||
* Type of downsampling to perform.
|
||||
*/
|
||||
public final DownsampleType type;
|
||||
|
||||
/**
|
||||
* Actual downsampling target is specified as an integer number of reads.
|
||||
*/
|
||||
public final Integer toCoverage;
|
||||
|
||||
/**
|
||||
* Actual downsampling target is specified as a fraction of total available reads.
|
||||
*/
|
||||
public final Double toFraction;
|
||||
|
||||
/**
|
||||
* Use the new experimental downsampling?
|
||||
*/
|
||||
public final boolean useExperimentalDownsampling;
|
||||
|
||||
/**
|
||||
* Expresses no downsampling applied at all.
|
||||
*/
|
||||
public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null,false);
|
||||
|
||||
/**
|
||||
* Default type to use if no type is specified
|
||||
*/
|
||||
public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
|
||||
|
||||
/**
|
||||
* Default target coverage for locus-based traversals
|
||||
*/
|
||||
public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000;
|
||||
|
||||
/**
 * Creates a downsampling method and validates the combination of settings.
 *
 * When the effective type is NONE, any supplied coverage/fraction targets are discarded so that
 * the stored targets are always null for a non-downsampling method. (Previously the targets were
 * stored first and only the constructor parameters were nulled, which had no effect.)
 *
 * @param type the kind of downsampling to perform; null selects DEFAULT_DOWNSAMPLING_TYPE
 * @param toCoverage target coverage as a number of reads, or null if not downsampling to coverage
 * @param toFraction target fraction of reads to keep, or null if not downsampling to a fraction
 * @param useExperimentalDownsampling whether the experimental downsampling implementation is in use
 * @throws UserException.CommandLineException if validate() rejects the resulting settings
 */
public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) {
    this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE;
    this.useExperimentalDownsampling = useExperimentalDownsampling;

    // Decide on the effective type (after defaulting), and assign each final field exactly once.
    if ( this.type == DownsampleType.NONE ) {
        this.toCoverage = null;
        this.toFraction = null;
    }
    else {
        this.toCoverage = toCoverage;
        this.toFraction = toFraction;
    }

    validate();
}
|
||||
|
||||
private void validate() {
|
||||
// Can't leave toFraction and toCoverage null unless type is NONE
|
||||
if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null )
|
||||
throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");
|
||||
|
||||
// Fraction and coverage cannot both be specified.
|
||||
if ( toFraction != null && toCoverage != null )
|
||||
throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");
|
||||
|
||||
// toCoverage must be > 0 when specified
|
||||
if ( toCoverage != null && toCoverage <= 0 ) {
|
||||
throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage");
|
||||
}
|
||||
|
||||
// toFraction must be >= 0.0 and <= 1.0 when specified
|
||||
if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) {
|
||||
throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads");
|
||||
}
|
||||
|
||||
// Some restrictions only exist for the old downsampling implementation:
|
||||
if ( ! useExperimentalDownsampling ) {
|
||||
// By sample downsampling does not work with a fraction of reads in the old downsampling implementation
|
||||
if( type == DownsampleType.BY_SAMPLE && toFraction != null )
|
||||
throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method");
|
||||
}
|
||||
|
||||
// Some restrictions only exist for the new downsampling implementation:
|
||||
if ( useExperimentalDownsampling ) {
|
||||
if ( type == DownsampleType.ALL_READS && toCoverage != null ) {
|
||||
throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder builder = new StringBuilder("Downsampling Settings: ");
|
||||
|
||||
if ( type == DownsampleType.NONE ) {
|
||||
builder.append("No downsampling");
|
||||
}
|
||||
else {
|
||||
builder.append(String.format("Method: %s ", type));
|
||||
|
||||
if ( toCoverage != null ) {
|
||||
builder.append(String.format("Target Coverage: %d ", toCoverage));
|
||||
}
|
||||
else {
|
||||
builder.append(String.format("Target Fraction: %.2f ", toFraction));
|
||||
}
|
||||
|
||||
if ( useExperimentalDownsampling ) {
|
||||
builder.append("Using Experimental Downsampling");
|
||||
}
|
||||
}
|
||||
|
||||
return builder.toString();
|
||||
}
|
||||
|
||||
public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) {
|
||||
if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) {
|
||||
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE,
|
||||
null, useExperimentalDownsampling);
|
||||
}
|
||||
else {
|
||||
return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -33,7 +33,8 @@ import java.util.NoSuchElementException;
|
|||
|
||||
|
||||
/**
|
||||
* StingSAMIterator wrapper around our generic reads downsampler interface
|
||||
* StingSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style
|
||||
* downsampler interface to a pull model.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
|
|
@ -42,35 +43,50 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
|
|||
private StingSAMIterator nestedSAMIterator;
|
||||
private ReadsDownsampler<SAMRecord> downsampler;
|
||||
private Collection<SAMRecord> downsampledReadsCache;
|
||||
private Iterator<SAMRecord> downsampledReadsCacheIterator;
|
||||
private SAMRecord nextRead = null;
|
||||
private Iterator<SAMRecord> downsampledReadsCacheIterator = null;
|
||||
|
||||
/**
|
||||
* @param iter wrapped iterator from which this iterator will pull reads
|
||||
* @param downsampler downsampler through which the reads will be fed
|
||||
*/
|
||||
public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
|
||||
nestedSAMIterator = iter;
|
||||
this.downsampler = downsampler;
|
||||
fillDownsampledReadsCache();
|
||||
|
||||
advanceToNextRead();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
if ( downsampledReadsCacheIterator.hasNext() ) {
|
||||
return true;
|
||||
}
|
||||
else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
return nextRead != null;
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) {
|
||||
if ( nextRead == null ) {
|
||||
throw new NoSuchElementException("next() called when there are no more items");
|
||||
}
|
||||
|
||||
return downsampledReadsCacheIterator.next();
|
||||
SAMRecord toReturn = nextRead;
|
||||
advanceToNextRead();
|
||||
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
private void advanceToNextRead() {
|
||||
if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
|
||||
nextRead = null;
|
||||
}
|
||||
else {
|
||||
nextRead = downsampledReadsCacheIterator.next();
|
||||
}
|
||||
}
|
||||
|
||||
private boolean readyToReleaseReads() {
|
||||
return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext();
|
||||
}
|
||||
|
||||
private boolean fillDownsampledReadsCache() {
|
||||
while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) {
|
||||
while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) {
|
||||
downsampler.submit(nestedSAMIterator.next());
|
||||
}
|
||||
|
||||
|
|
@ -78,7 +94,8 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
|
|||
downsampler.signalEndOfInput();
|
||||
}
|
||||
|
||||
downsampledReadsCache = downsampler.consumeDownsampledItems();
|
||||
// use returned collection directly rather than make a copy, for speed
|
||||
downsampledReadsCache = downsampler.consumeFinalizedItems();
|
||||
downsampledReadsCacheIterator = downsampledReadsCache.iterator();
|
||||
|
||||
return downsampledReadsCacheIterator.hasNext();
|
||||
|
|
|
|||
|
|
@ -33,7 +33,10 @@ import java.util.Collection;
|
|||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Fractional Downsampler: selects a specified fraction of the reads for inclusion
|
||||
* Fractional Downsampler: selects a specified fraction of the reads for inclusion.
|
||||
*
|
||||
* Since the selection is done randomly, the actual fraction of reads retained may be slightly
|
||||
* more or less than the requested fraction, depending on the total number of reads submitted.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
|
|
@ -43,8 +46,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
|
||||
private int cutoffForInclusion;
|
||||
|
||||
private int numDiscardedItems;
|
||||
|
||||
private static final int RANDOM_POOL_SIZE = 10000;
|
||||
|
||||
/**
|
||||
* Construct a FractionalDownsampler
|
||||
*
|
||||
* @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive).
|
||||
* Actual number of reads preserved may differ randomly.
|
||||
*/
|
||||
public FractionalDownsampler( double fraction ) {
|
||||
if ( fraction < 0.0 || fraction > 1.0 ) {
|
||||
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
|
||||
|
|
@ -52,12 +63,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
|
||||
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
|
||||
clear();
|
||||
reset();
|
||||
}
|
||||
|
||||
public void submit( T newRead ) {
|
||||
if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) {
|
||||
selectedReads.add(newRead);
|
||||
}
|
||||
else {
|
||||
numDiscardedItems++;
|
||||
}
|
||||
}
|
||||
|
||||
public void submit( Collection<T> newReads ) {
|
||||
|
|
@ -66,11 +81,12 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
}
|
||||
}
|
||||
|
||||
public boolean hasDownsampledItems() {
|
||||
public boolean hasFinalizedItems() {
|
||||
return selectedReads.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeDownsampledItems() {
|
||||
public List<T> consumeFinalizedItems() {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> downsampledItems = selectedReads;
|
||||
clear();
|
||||
return downsampledItems;
|
||||
|
|
@ -80,6 +96,18 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
return false;
|
||||
}
|
||||
|
||||
public T peekFinalized() {
|
||||
return selectedReads.isEmpty() ? null : selectedReads.get(0);
|
||||
}
|
||||
|
||||
public T peekPending() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
|
@ -88,7 +116,15 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
|
|||
selectedReads = new ArrayList<T>();
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
numDiscardedItems = 0;
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
// NO-OP
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,45 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Factory for creating FractionalDownsamplers on demand
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class FractionalDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {
|
||||
|
||||
private double fraction;
|
||||
|
||||
public FractionalDownsamplerFactory( double fraction ) {
|
||||
this.fraction = fraction;
|
||||
}
|
||||
|
||||
public ReadsDownsampler<T> newInstance() {
|
||||
return new FractionalDownsampler<T>(fraction);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,212 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Leveling Downsampler: Given a set of Lists of arbitrary items and a target size, removes items from
|
||||
* the Lists in an even fashion until the total size of all Lists is <= the target size. Leveling
|
||||
* does not occur until all Lists have been submitted and signalEndOfInput() is called.
|
||||
*
|
||||
* The Lists should be LinkedLists for maximum efficiency during item removal, however other
|
||||
* kinds of Lists are also accepted (albeit at a slight performance penalty).
|
||||
*
|
||||
* Since this downsampler extends the Downsampler interface rather than the ReadsDownsampler interface,
|
||||
* the Lists need not contain reads. However this downsampler may not be wrapped within one of the
|
||||
* DownsamplingReadsIterators
|
||||
*
|
||||
* @param <T> the List type representing the stacks to be leveled
|
||||
* @param <E> the type of the elements of each List
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class LevelingDownsampler<T extends List<E>, E> implements Downsampler<T> {
|
||||
|
||||
private int targetSize;
|
||||
|
||||
private List<T> groups;
|
||||
|
||||
private boolean groupsAreFinalized;
|
||||
|
||||
private int numDiscardedItems;
|
||||
|
||||
/**
|
||||
* Construct a LevelingDownsampler
|
||||
*
|
||||
* @param targetSize the sum of the sizes of all individual Lists this downsampler is fed may not exceed
|
||||
* this value -- if it does, items are removed from Lists evenly until the total size
|
||||
* is <= this value
|
||||
*/
|
||||
public LevelingDownsampler( int targetSize ) {
|
||||
this.targetSize = targetSize;
|
||||
clear();
|
||||
reset();
|
||||
}
|
||||
|
||||
public void submit( T item ) {
|
||||
groups.add(item);
|
||||
}
|
||||
|
||||
public void submit( Collection<T> items ){
|
||||
groups.addAll(items);
|
||||
}
|
||||
|
||||
public boolean hasFinalizedItems() {
|
||||
return groupsAreFinalized && groups.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeFinalizedItems() {
|
||||
if ( ! hasFinalizedItems() ) {
|
||||
return new ArrayList<T>();
|
||||
}
|
||||
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> toReturn = groups;
|
||||
clear();
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
public boolean hasPendingItems() {
|
||||
return ! groupsAreFinalized && groups.size() > 0;
|
||||
}
|
||||
|
||||
public T peekFinalized() {
|
||||
return hasFinalizedItems() ? groups.get(0) : null;
|
||||
}
|
||||
|
||||
public T peekPending() {
|
||||
return hasPendingItems() ? groups.get(0) : null;
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
levelGroups();
|
||||
groupsAreFinalized = true;
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
groups = new ArrayList<T>();
|
||||
groupsAreFinalized = false;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
numDiscardedItems = 0;
|
||||
}
|
||||
|
||||
private void levelGroups() {
|
||||
int totalSize = 0;
|
||||
int[] groupSizes = new int[groups.size()];
|
||||
int currentGroupIndex = 0;
|
||||
|
||||
for ( T group : groups ) {
|
||||
groupSizes[currentGroupIndex] = group.size();
|
||||
totalSize += groupSizes[currentGroupIndex];
|
||||
currentGroupIndex++;
|
||||
}
|
||||
|
||||
if ( totalSize <= targetSize ) {
|
||||
return; // no need to eliminate any items
|
||||
}
|
||||
|
||||
// We will try to remove exactly this many items, however we will refuse to allow any
|
||||
// one group to fall below size 1, and so might end up removing fewer items than this
|
||||
int numItemsToRemove = totalSize - targetSize;
|
||||
|
||||
currentGroupIndex = 0;
|
||||
int numConsecutiveUmodifiableGroups = 0;
|
||||
|
||||
// Continue until we've either removed all the items we wanted to, or we can't
|
||||
// remove any more items without violating the constraint that all groups must
|
||||
// be left with at least one item
|
||||
while ( numItemsToRemove > 0 && numConsecutiveUmodifiableGroups < groupSizes.length ) {
|
||||
if ( groupSizes[currentGroupIndex] > 1 ) {
|
||||
groupSizes[currentGroupIndex]--;
|
||||
numItemsToRemove--;
|
||||
numConsecutiveUmodifiableGroups = 0;
|
||||
}
|
||||
else {
|
||||
numConsecutiveUmodifiableGroups++;
|
||||
}
|
||||
|
||||
currentGroupIndex = (currentGroupIndex + 1) % groupSizes.length;
|
||||
}
|
||||
|
||||
// Now we actually go through and reduce each group to its new count as specified in groupSizes
|
||||
currentGroupIndex = 0;
|
||||
for ( T group : groups ) {
|
||||
downsampleOneGroup(group, groupSizes[currentGroupIndex]);
|
||||
currentGroupIndex++;
|
||||
}
|
||||
}
|
||||
|
||||
private void downsampleOneGroup( T group, int numItemsToKeep ) {
|
||||
if ( numItemsToKeep >= group.size() ) {
|
||||
return;
|
||||
}
|
||||
|
||||
numDiscardedItems += group.size() - numItemsToKeep;
|
||||
|
||||
BitSet itemsToKeep = new BitSet(group.size());
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(group.size(), numItemsToKeep) ) {
|
||||
itemsToKeep.set(selectedIndex);
|
||||
}
|
||||
|
||||
int currentIndex = 0;
|
||||
|
||||
// If our group is a linked list, we can remove the desired items in a single O(n) pass with an iterator
|
||||
if ( group instanceof LinkedList ) {
|
||||
Iterator iter = group.iterator();
|
||||
while ( iter.hasNext() ) {
|
||||
iter.next();
|
||||
|
||||
if ( ! itemsToKeep.get(currentIndex) ) {
|
||||
iter.remove();
|
||||
}
|
||||
|
||||
currentIndex++;
|
||||
}
|
||||
}
|
||||
// If it's not a linked list, it's more efficient to copy the desired items into a new list and back rather
|
||||
// than suffer O(n^2) of item shifting
|
||||
else {
|
||||
List<E> keptItems = new ArrayList<E>(numItemsToKeep);
|
||||
|
||||
for ( E item : group ) {
|
||||
if ( itemsToKeep.get(currentIndex) ) {
|
||||
keptItems.add(item);
|
||||
}
|
||||
currentIndex++;
|
||||
}
|
||||
group.clear();
|
||||
group.addAll(keptItems);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,202 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMRecordComparator;
|
||||
import net.sf.samtools.SAMRecordCoordinateComparator;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* StingSAMIterator wrapper around our generic reads downsampler interface
|
||||
* that downsamples reads for each sample independently, and then re-assembles
|
||||
* the reads back into a single merged stream.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class PerSampleDownsamplingReadsIterator implements StingSAMIterator {
|
||||
|
||||
private StingSAMIterator nestedSAMIterator;
|
||||
private ReadsDownsamplerFactory<SAMRecord> downsamplerFactory;
|
||||
private Map<String, ReadsDownsampler<SAMRecord>> perSampleDownsamplers;
|
||||
private PriorityQueue<SAMRecord> orderedDownsampledReadsCache;
|
||||
private SAMRecord nextRead = null;
|
||||
private SAMRecordComparator readComparator = new SAMRecordCoordinateComparator();
|
||||
private SAMRecord earliestPendingRead = null;
|
||||
private ReadsDownsampler<SAMRecord> earliestPendingDownsampler = null;
|
||||
|
||||
// Initial size of our cache of finalized reads
|
||||
private static final int DOWNSAMPLED_READS_INITIAL_CACHE_SIZE = 4096;
|
||||
|
||||
// The number of positional changes that can occur in the read stream before all downsamplers
|
||||
// should be informed of the current position (guards against samples with relatively sparse reads
|
||||
// getting stuck in a pending state):
|
||||
private static final int DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL = 3; // TODO: experiment with this value
|
||||
|
||||
/**
|
||||
* @param iter wrapped iterator from which this iterator will pull reads
|
||||
* @param downsamplerFactory factory used to create new downsamplers as needed
|
||||
*/
|
||||
public PerSampleDownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsamplerFactory<SAMRecord> downsamplerFactory ) {
|
||||
nestedSAMIterator = iter;
|
||||
this.downsamplerFactory = downsamplerFactory;
|
||||
perSampleDownsamplers = new HashMap<String, ReadsDownsampler<SAMRecord>>();
|
||||
orderedDownsampledReadsCache = new PriorityQueue<SAMRecord>(DOWNSAMPLED_READS_INITIAL_CACHE_SIZE, readComparator);
|
||||
|
||||
advanceToNextRead();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return nextRead != null;
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
if ( nextRead == null ) {
|
||||
throw new NoSuchElementException("next() called when there are no more items");
|
||||
}
|
||||
|
||||
SAMRecord toReturn = nextRead;
|
||||
advanceToNextRead();
|
||||
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
private void advanceToNextRead() {
|
||||
if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
|
||||
nextRead = null;
|
||||
}
|
||||
else {
|
||||
nextRead = orderedDownsampledReadsCache.poll();
|
||||
}
|
||||
}
|
||||
|
||||
private boolean readyToReleaseReads() {
|
||||
if ( orderedDownsampledReadsCache.isEmpty() ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return earliestPendingRead == null ||
|
||||
readComparator.compare(orderedDownsampledReadsCache.peek(), earliestPendingRead) <= 0;
|
||||
}
|
||||
|
||||
private void updateEarliestPendingRead( ReadsDownsampler<SAMRecord> currentDownsampler ) {
|
||||
// If there is no recorded earliest pending read and this downsampler has pending items,
|
||||
// then this downsampler's first pending item becomes the new earliest pending read:
|
||||
if ( earliestPendingRead == null && currentDownsampler.hasPendingItems() ) {
|
||||
earliestPendingRead = currentDownsampler.peekPending();
|
||||
earliestPendingDownsampler = currentDownsampler;
|
||||
}
|
||||
// In all other cases, we only need to update the earliest pending read when the downsampler
|
||||
// associated with it experiences a change in its pending reads, since by assuming a sorted
|
||||
// read stream we're assured that each downsampler's earliest pending read will only increase
|
||||
// in genomic position over time.
|
||||
//
|
||||
// TODO: An occasional O(samples) linear search seems like a better option than keeping the downsamplers
|
||||
// TODO: sorted by earliest pending read, which would cost at least O(total_reads * (samples + log(samples))),
|
||||
// TODO: but need to verify this empirically.
|
||||
else if ( currentDownsampler == earliestPendingDownsampler &&
|
||||
(! currentDownsampler.hasPendingItems() || readComparator.compare(currentDownsampler.peekPending(), earliestPendingRead) != 0) ) {
|
||||
|
||||
earliestPendingRead = null;
|
||||
earliestPendingDownsampler = null;
|
||||
for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
|
||||
if ( perSampleDownsampler.hasPendingItems() &&
|
||||
(earliestPendingRead == null || readComparator.compare(perSampleDownsampler.peekPending(), earliestPendingRead) < 0) ) {
|
||||
|
||||
earliestPendingRead = perSampleDownsampler.peekPending();
|
||||
earliestPendingDownsampler = perSampleDownsampler;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean fillDownsampledReadsCache() {
|
||||
SAMRecord prevRead = null;
|
||||
int numPositionalChanges = 0;
|
||||
|
||||
// Continue submitting reads to the per-sample downsamplers until the read at the top of the priority queue
|
||||
// can be released without violating global sort order
|
||||
while ( nestedSAMIterator.hasNext() && ! readyToReleaseReads() ) {
|
||||
SAMRecord read = nestedSAMIterator.next();
|
||||
String sampleName = read.getReadGroup() != null ? read.getReadGroup().getSample() : null;
|
||||
|
||||
ReadsDownsampler<SAMRecord> thisSampleDownsampler = perSampleDownsamplers.get(sampleName);
|
||||
if ( thisSampleDownsampler == null ) {
|
||||
thisSampleDownsampler = downsamplerFactory.newInstance();
|
||||
perSampleDownsamplers.put(sampleName, thisSampleDownsampler);
|
||||
}
|
||||
|
||||
thisSampleDownsampler.submit(read);
|
||||
updateEarliestPendingRead(thisSampleDownsampler);
|
||||
|
||||
if ( prevRead != null && prevRead.getAlignmentStart() != read.getAlignmentStart() ) {
|
||||
numPositionalChanges++;
|
||||
}
|
||||
|
||||
// Periodically inform all downsamplers of the current position in the read stream. This is
|
||||
// to prevent downsamplers for samples with sparser reads than others from getting stuck too
|
||||
// long in a pending state.
|
||||
if ( numPositionalChanges > 0 && numPositionalChanges % DOWNSAMPLER_POSITIONAL_UPDATE_INTERVAL == 0 ) {
|
||||
for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
|
||||
perSampleDownsampler.signalNoMoreReadsBefore(read);
|
||||
updateEarliestPendingRead(perSampleDownsampler);
|
||||
}
|
||||
}
|
||||
|
||||
prevRead = read;
|
||||
}
|
||||
|
||||
if ( ! nestedSAMIterator.hasNext() ) {
|
||||
for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
|
||||
perSampleDownsampler.signalEndOfInput();
|
||||
}
|
||||
earliestPendingRead = null;
|
||||
earliestPendingDownsampler = null;
|
||||
}
|
||||
|
||||
for ( ReadsDownsampler<SAMRecord> perSampleDownsampler : perSampleDownsamplers.values() ) {
|
||||
if ( perSampleDownsampler.hasFinalizedItems() ) {
|
||||
orderedDownsampledReadsCache.addAll(perSampleDownsampler.consumeFinalizedItems());
|
||||
}
|
||||
}
|
||||
|
||||
return readyToReleaseReads();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
|
||||
}
|
||||
|
||||
public void close() {
|
||||
nestedSAMIterator.close();
|
||||
}
|
||||
|
||||
public Iterator<SAMRecord> iterator() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,259 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class PositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
|
||||
private int targetCoverage;
|
||||
|
||||
private ReservoirDownsampler<T> reservoir;
|
||||
|
||||
private int currentContigIndex;
|
||||
|
||||
private int currentAlignmentStart;
|
||||
|
||||
private LinkedList<PositionalReadGrouping> pendingReads;
|
||||
|
||||
private ArrayList<T> finalizedReads;
|
||||
|
||||
public PositionalDownsampler ( int targetCoverage ) {
|
||||
this.targetCoverage = targetCoverage;
|
||||
clear();
|
||||
}
|
||||
|
||||
public void submit ( T newRead ) {
|
||||
if ( readIsPastCurrentPosition(newRead) ) {
|
||||
updateAndDownsamplePendingReads();
|
||||
}
|
||||
|
||||
reservoir.submit(newRead);
|
||||
updateCurrentPosition(newRead);
|
||||
}
|
||||
|
||||
public void submit ( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasDownsampledItems() {
|
||||
return finalizedReads.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeDownsampledItems() {
|
||||
List<T> toReturn = finalizedReads;
|
||||
finalizedReads = new ArrayList<T>();
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
public boolean hasPendingItems() {
|
||||
return pendingReads.size() > 0;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
updateAndDownsamplePendingReads();
|
||||
|
||||
for ( PositionalReadGrouping group : pendingReads ) {
|
||||
group.finalizeAllActiveReads();
|
||||
finalizedReads.addAll(group.getFinalizedReads());
|
||||
}
|
||||
|
||||
pendingReads.clear();
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
reservoir = new ReservoirDownsampler<T>(targetCoverage);
|
||||
pendingReads = new LinkedList<PositionalReadGrouping>();
|
||||
finalizedReads = new ArrayList<T>();
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return true;
|
||||
}
|
||||
|
||||
private void updateCurrentPosition ( T read ) {
|
||||
currentContigIndex = read.getReferenceIndex();
|
||||
currentAlignmentStart = read.getAlignmentStart();
|
||||
}
|
||||
|
||||
private boolean readIsPastCurrentPosition ( T read ) {
|
||||
return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
|
||||
}
|
||||
|
||||
private void updateAndDownsamplePendingReads() {
|
||||
finalizeOutOfScopeReads();
|
||||
|
||||
List<T> oldLocusReads = reservoir.consumeDownsampledItems();
|
||||
pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));
|
||||
|
||||
downsampleOverlappingGroups();
|
||||
}
|
||||
|
||||
private void finalizeOutOfScopeReads() {
|
||||
Iterator<PositionalReadGrouping> iter = pendingReads.iterator();
|
||||
boolean noPrecedingUnfinalizedGroups = true;
|
||||
|
||||
while ( iter.hasNext() ) {
|
||||
PositionalReadGrouping currentGroup = iter.next();
|
||||
currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);
|
||||
|
||||
if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
|
||||
iter.remove();
|
||||
finalizedReads.addAll(currentGroup.getFinalizedReads());
|
||||
}
|
||||
else {
|
||||
noPrecedingUnfinalizedGroups = false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void downsampleOverlappingGroups() {
|
||||
int[] groupReadCounts = new int[pendingReads.size()];
|
||||
int totalCoverage = 0;
|
||||
int numActiveGroups = 0;
|
||||
int currentGroup = 0;
|
||||
|
||||
for ( PositionalReadGrouping group : pendingReads ) {
|
||||
groupReadCounts[currentGroup] = group.numActiveReads();
|
||||
totalCoverage += groupReadCounts[currentGroup];
|
||||
|
||||
if ( groupReadCounts[currentGroup] > 0 ) {
|
||||
numActiveGroups++;
|
||||
}
|
||||
|
||||
currentGroup++;
|
||||
}
|
||||
|
||||
if ( totalCoverage <= targetCoverage ) {
|
||||
return;
|
||||
}
|
||||
|
||||
int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
|
||||
currentGroup = 0;
|
||||
|
||||
while ( numReadsToRemove > 0 ) {
|
||||
if ( groupReadCounts[currentGroup] > 1 ) {
|
||||
groupReadCounts[currentGroup]--;
|
||||
numReadsToRemove--;
|
||||
}
|
||||
|
||||
currentGroup = (currentGroup + 1) % groupReadCounts.length;
|
||||
}
|
||||
|
||||
currentGroup = 0;
|
||||
for ( PositionalReadGrouping group : pendingReads ) {
|
||||
if ( ! group.isFinalized() ) {
|
||||
group.downsampleActiveReads(groupReadCounts[currentGroup]);
|
||||
}
|
||||
currentGroup++;
|
||||
}
|
||||
}
|
||||
|
||||
private class PositionalReadGrouping {
|
||||
private List<T> activeReads;
|
||||
private List<T> finalizedReads;
|
||||
|
||||
private int contig;
|
||||
private int alignmentStart;
|
||||
|
||||
public PositionalReadGrouping( Collection<T> reads, int contig, int alignmentStart ) {
|
||||
activeReads = new LinkedList<T>(reads);
|
||||
finalizedReads = new ArrayList<T>();
|
||||
this.contig = contig;
|
||||
this.alignmentStart = alignmentStart;
|
||||
}
|
||||
|
||||
public int numActiveReads() {
|
||||
return activeReads.size();
|
||||
}
|
||||
|
||||
public boolean isFinalized() {
|
||||
return activeReads.size() == 0;
|
||||
}
|
||||
|
||||
public List<T> getFinalizedReads() {
|
||||
return finalizedReads;
|
||||
}
|
||||
|
||||
public void finalizeActiveReadsBeforePosition( int contig, int position ) {
|
||||
if ( this.contig != contig ) {
|
||||
finalizeAllActiveReads();
|
||||
return;
|
||||
}
|
||||
|
||||
Iterator<T> iter = activeReads.iterator();
|
||||
|
||||
while ( iter.hasNext() ) {
|
||||
T read = iter.next();
|
||||
if ( read.getAlignmentEnd() < position ) {
|
||||
iter.remove();
|
||||
finalizedReads.add(read);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void finalizeAllActiveReads() {
|
||||
finalizedReads.addAll(activeReads);
|
||||
activeReads.clear();
|
||||
}
|
||||
|
||||
public void downsampleActiveReads( int numReadsToKeep ) {
|
||||
if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
|
||||
throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
|
||||
numReadsToKeep, activeReads.size()));
|
||||
}
|
||||
|
||||
BitSet itemsToKeep = new BitSet(activeReads.size());
|
||||
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
|
||||
itemsToKeep.set(selectedIndex);
|
||||
}
|
||||
|
||||
int currentIndex = 0;
|
||||
Iterator<T> iter = activeReads.iterator();
|
||||
|
||||
while ( iter.hasNext() ) {
|
||||
T read = iter.next();
|
||||
|
||||
if ( ! itemsToKeep.get(currentIndex) ) {
|
||||
iter.remove();
|
||||
}
|
||||
|
||||
currentIndex++;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -33,8 +33,23 @@ import net.sf.samtools.SAMRecord;
|
|||
*/
|
||||
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {
|
||||
|
||||
/*
|
||||
/**
|
||||
* Does this downsampler require that reads be fed to it in coordinate order?
|
||||
*
|
||||
* @return true if reads must be submitted to this downsampler in coordinate order, otherwise false
|
||||
*/
|
||||
public boolean requiresCoordinateSortOrder();
|
||||
|
||||
/**
|
||||
* Tell this downsampler that no more reads located before the provided read (according to
|
||||
* the sort order of the read stream) will be fed to it.
|
||||
*
|
||||
* Allows position-aware downsamplers to finalize pending reads earlier than they would
|
||||
* otherwise be able to, particularly when doing per-sample downsampling and reads for
|
||||
* certain samples are sparser than average.
|
||||
*
|
||||
* @param read the downsampler will assume that no reads located before this read will ever
|
||||
* be submitted to it in the future
|
||||
*/
|
||||
public void signalNoMoreReadsBefore( T read );
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,37 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* A ReadsDownsamplerFactory can be used to create an arbitrary number of instances of a particular
|
||||
* downsampler, all sharing the same construction parameters.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public interface ReadsDownsamplerFactory<T extends SAMRecord> {
|
||||
public ReadsDownsampler<T> newInstance();
|
||||
}
|
||||
|
|
@ -48,6 +48,14 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
|
||||
private int totalReadsSeen;
|
||||
|
||||
private int numDiscardedItems;
|
||||
|
||||
/**
|
||||
* Construct a ReservoirDownsampler
|
||||
*
|
||||
* @param targetSampleSize Size of the reservoir used by this downsampler. Number of items retained
|
||||
* after downsampling will be min(totalReads, targetSampleSize)
|
||||
*/
|
||||
public ReservoirDownsampler ( int targetSampleSize ) {
|
||||
if ( targetSampleSize <= 0 ) {
|
||||
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
|
||||
|
|
@ -55,6 +63,7 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
|
||||
this.targetSampleSize = targetSampleSize;
|
||||
clear();
|
||||
reset();
|
||||
}
|
||||
|
||||
public void submit ( T newRead ) {
|
||||
|
|
@ -68,6 +77,7 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
if ( randomSlot < targetSampleSize ) {
|
||||
reservoir.set(randomSlot, newRead);
|
||||
}
|
||||
numDiscardedItems++;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -77,11 +87,12 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
}
|
||||
}
|
||||
|
||||
public boolean hasDownsampledItems() {
|
||||
public boolean hasFinalizedItems() {
|
||||
return reservoir.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeDownsampledItems() {
|
||||
public List<T> consumeFinalizedItems() {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> downsampledItems = reservoir;
|
||||
clear();
|
||||
return downsampledItems;
|
||||
|
|
@ -91,16 +102,36 @@ public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampl
|
|||
return false;
|
||||
}
|
||||
|
||||
public T peekFinalized() {
|
||||
return reservoir.isEmpty() ? null : reservoir.get(0);
|
||||
}
|
||||
|
||||
public T peekPending() {
|
||||
return null;
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
reservoir = new ArrayList<T>(targetSampleSize);
|
||||
totalReadsSeen = 0;
|
||||
totalReadsSeen = 0; // an internal stat used by the downsampling process, so not cleared by reset() below
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
numDiscardedItems = 0;
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
// NO-OP
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,45 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Factory for creating ReservoirDownsamplers on demand
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class ReservoirDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {
|
||||
|
||||
private int targetSampleSize;
|
||||
|
||||
public ReservoirDownsamplerFactory( int targetSampleSize ) {
|
||||
this.targetSampleSize = targetSampleSize;
|
||||
}
|
||||
|
||||
public ReadsDownsampler<T> newInstance() {
|
||||
return new ReservoirDownsampler<T>(targetSampleSize);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,169 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Simple Positional Downsampler: Downsample each stack of reads at each alignment start to a size <= a target coverage
|
||||
* using a Reservoir downsampler. Stores only O(target coverage) reads in memory at any given time.
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class SimplePositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
|
||||
private int targetCoverage;
|
||||
|
||||
private ReservoirDownsampler<T> reservoir;
|
||||
|
||||
private int currentContigIndex;
|
||||
|
||||
private int currentAlignmentStart;
|
||||
|
||||
private boolean positionEstablished;
|
||||
|
||||
private boolean unmappedReadsReached;
|
||||
|
||||
private ArrayList<T> finalizedReads;
|
||||
|
||||
private int numDiscardedItems;
|
||||
|
||||
/**
|
||||
* Construct a SimplePositionalDownsampler
|
||||
*
|
||||
* @param targetCoverage Maximum number of reads that may share any given alignment start position
|
||||
*/
|
||||
public SimplePositionalDownsampler( int targetCoverage ) {
|
||||
this.targetCoverage = targetCoverage;
|
||||
reservoir = new ReservoirDownsampler<T>(targetCoverage);
|
||||
finalizedReads = new ArrayList<T>();
|
||||
clear();
|
||||
reset();
|
||||
}
|
||||
|
||||
public void submit( T newRead ) {
|
||||
updatePositionalState(newRead);
|
||||
|
||||
if ( unmappedReadsReached ) { // don't downsample the unmapped reads at the end of the stream
|
||||
finalizedReads.add(newRead);
|
||||
}
|
||||
else {
|
||||
int reservoirPreviouslyDiscardedItems = reservoir.getNumberOfDiscardedItems();
|
||||
reservoir.submit(newRead);
|
||||
numDiscardedItems += reservoir.getNumberOfDiscardedItems() - reservoirPreviouslyDiscardedItems;
|
||||
}
|
||||
}
|
||||
|
||||
public void submit( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasFinalizedItems() {
|
||||
return finalizedReads.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeFinalizedItems() {
|
||||
// pass by reference rather than make a copy, for speed
|
||||
List<T> toReturn = finalizedReads;
|
||||
finalizedReads = new ArrayList<T>();
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
public boolean hasPendingItems() {
|
||||
return reservoir.hasFinalizedItems();
|
||||
}
|
||||
|
||||
public T peekFinalized() {
|
||||
return finalizedReads.isEmpty() ? null : finalizedReads.get(0);
|
||||
}
|
||||
|
||||
public T peekPending() {
|
||||
return reservoir.peekFinalized();
|
||||
}
|
||||
|
||||
public int getNumberOfDiscardedItems() {
|
||||
return numDiscardedItems;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
finalizeReservoir();
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
reservoir.clear();
|
||||
reservoir.reset();
|
||||
finalizedReads.clear();
|
||||
positionEstablished = false;
|
||||
unmappedReadsReached = false;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
numDiscardedItems = 0;
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void signalNoMoreReadsBefore( T read ) {
|
||||
updatePositionalState(read);
|
||||
}
|
||||
|
||||
private void updatePositionalState( T newRead ) {
|
||||
if ( readIsPastCurrentPosition(newRead) ) {
|
||||
if ( reservoir.hasFinalizedItems() ) {
|
||||
finalizeReservoir();
|
||||
}
|
||||
|
||||
setCurrentPosition(newRead);
|
||||
|
||||
if ( newRead.getReadUnmappedFlag() ) {
|
||||
unmappedReadsReached = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void setCurrentPosition( T read ) {
|
||||
currentContigIndex = read.getReferenceIndex();
|
||||
currentAlignmentStart = read.getAlignmentStart();
|
||||
positionEstablished = true;
|
||||
}
|
||||
|
||||
private boolean readIsPastCurrentPosition( T read ) {
|
||||
return ! positionEstablished ||
|
||||
read.getReferenceIndex() > currentContigIndex ||
|
||||
read.getAlignmentStart() > currentAlignmentStart ||
|
||||
(read.getReadUnmappedFlag() && ! unmappedReadsReached);
|
||||
}
|
||||
|
||||
private void finalizeReservoir() {
|
||||
finalizedReads.addAll(reservoir.consumeFinalizedItems());
|
||||
reservoir.reset();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
|
||||
* Factory for creating SimplePositionalDownsamplers on demand
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class SimplePositionalDownsamplerFactory<T extends SAMRecord> implements ReadsDownsamplerFactory<T> {
|
||||
|
||||
private int targetCoverage;
|
||||
|
||||
public SimplePositionalDownsamplerFactory( int targetCoverage ) {
|
||||
this.targetCoverage = targetCoverage;
|
||||
}
|
||||
|
||||
public ReadsDownsampler<T> newInstance() {
|
||||
return new SimplePositionalDownsampler<T>(targetCoverage);
|
||||
}
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue