Merge remote-tracking branch 'unstable/master'
This commit is contained in:
commit
c011837147
136
ivy.xml
136
ivy.xml
|
|
@ -23,86 +23,90 @@
|
|||
-->
|
||||
|
||||
<ivy-module version="1.0">
|
||||
<info organisation="org.broadinstitute" module="Sting"/>
|
||||
<configurations defaultconfmapping="test->default">
|
||||
<conf name="default" description="the core dependencies for the GATK"/>
|
||||
<conf name="test" extends="default" description="external dependencies used for testing and metrics" />
|
||||
<conf name="scala" extends="default" description="the dependencies for scala"/>
|
||||
<conf name="queue" extends="scala" description="the dependencies for Queue"/>
|
||||
</configurations>
|
||||
<dependencies defaultconf="default">
|
||||
<dependency org="net.sf" name="sam" rev="latest.integration"/>
|
||||
<dependency org="net.sf" name="picard" rev="latest.integration"/>
|
||||
<dependency org="edu.mit.broad" name="picard-private-parts" rev="latest.integration"/>
|
||||
<info organisation="org.broadinstitute" module="Sting"/>
|
||||
<configurations defaultconfmapping="test->default">
|
||||
<conf name="default" description="the core dependencies for the GATK"/>
|
||||
<conf name="test" extends="default" description="external dependencies used for testing and metrics"/>
|
||||
<conf name="scala" extends="default" description="the dependencies for scala"/>
|
||||
<conf name="queue" extends="scala" description="the dependencies for Queue"/>
|
||||
</configurations>
|
||||
<dependencies defaultconf="default">
|
||||
<dependency org="net.sf" name="sam" rev="latest.integration"/>
|
||||
<dependency org="net.sf" name="picard" rev="latest.integration"/>
|
||||
<dependency org="edu.mit.broad" name="picard-private-parts" rev="latest.integration"/>
|
||||
|
||||
<!-- Tribble -->
|
||||
<dependency org="org.broad" name="tribble" rev="latest.integration"/>
|
||||
<!-- Tribble -->
|
||||
<dependency org="org.broad" name="tribble" rev="latest.integration"/>
|
||||
|
||||
<dependency org="log4j" name="log4j" rev="1.2.15"/>
|
||||
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
|
||||
<dependency org="colt" name="colt" rev="1.2.0"/>
|
||||
<!-- <dependency org="jboss" name="javassist" rev="3.7.ga"/> -->
|
||||
<dependency org="org.simpleframework" name="simple-xml" rev="2.0.4"/>
|
||||
<dependency org="org.apache.bcel" name="bcel" rev="5.2"/>
|
||||
<dependency org="log4j" name="log4j" rev="1.2.15"/>
|
||||
<dependency org="javax.mail" name="mail" rev="1.4.4"/>
|
||||
<dependency org="colt" name="colt" rev="1.2.0"/>
|
||||
<!-- <dependency org="jboss" name="javassist" rev="3.7.ga"/> -->
|
||||
<dependency org="org.simpleframework" name="simple-xml" rev="2.0.4"/>
|
||||
<dependency org="org.apache.bcel" name="bcel" rev="5.2"/>
|
||||
|
||||
<!-- Dependencies for reflections mvn repository -->
|
||||
<dependency org="org.reflections" name="reflections" rev="0.9.5-RC2"/>
|
||||
<!-- Dependencies for reflections mvn repository -->
|
||||
<dependency org="org.reflections" name="reflections" rev="0.9.5-RC2"/>
|
||||
|
||||
<!-- Matrix package from math.nist.gov -->
|
||||
<dependency org="gov.nist" name="Jama" rev="1.0.2"/>
|
||||
<!-- Matrix package from math.nist.gov -->
|
||||
<dependency org="gov.nist" name="Jama" rev="1.0.2"/>
|
||||
|
||||
<!-- Dependencies for the graph aligner -->
|
||||
<dependency org="org.jgrapht" name="jgrapht-jdk1.5" rev="0.7.3"/>
|
||||
<!-- Dependencies for the graph aligner -->
|
||||
<dependency org="org.jgrapht" name="jgrapht-jdk1.5" rev="0.7.3"/>
|
||||
|
||||
<!-- Dependencies for the html walker documention -->
|
||||
<dependency org="org.freemarker" name="freemarker" rev="2.3.18"/>
|
||||
|
||||
<!-- Commons Dependencies -->
|
||||
<dependency org="org.apache.commons" name="commons-email" rev="1.2"/>
|
||||
<dependency org="org.apache.commons" name="commons-jexl" rev="2.0"/>
|
||||
<dependency org="commons-lang" name="commons-lang" rev="2.5"/>
|
||||
<dependency org="commons-logging" name="commons-logging" rev="1.1.1"/>
|
||||
<dependency org="commons-io" name="commons-io" rev="2.1"/>
|
||||
<dependency org="org.apache.commons" name="commons-math" rev="2.2" />
|
||||
<!-- Dependencies for the html walker documention -->
|
||||
<dependency org="org.freemarker" name="freemarker" rev="2.3.18"/>
|
||||
|
||||
<!-- Lucene core utilities -->
|
||||
<!-- <dependency org="org.apache.lucene" name="lucene-core" rev="3.0.3"/> -->
|
||||
<!-- Commons Dependencies -->
|
||||
<dependency org="org.apache.commons" name="commons-email" rev="1.2"/>
|
||||
<dependency org="org.apache.commons" name="commons-jexl" rev="2.1.1"/>
|
||||
<dependency org="commons-lang" name="commons-lang" rev="2.5"/>
|
||||
<dependency org="commons-logging" name="commons-logging" rev="1.1.1"/>
|
||||
<dependency org="commons-io" name="commons-io" rev="2.1"/>
|
||||
<dependency org="org.apache.commons" name="commons-math" rev="2.2"/>
|
||||
|
||||
<!-- Dependencies for LSF, DRMAA, and other C libraries -->
|
||||
<dependency org="net.java.dev.jna" name="jna" rev="3.2.7"/>
|
||||
<!-- Lucene core utilities -->
|
||||
<!-- <dependency org="org.apache.lucene" name="lucene-core" rev="3.0.3"/> -->
|
||||
|
||||
<!-- Dependencies for amazon.com S3 support -->
|
||||
<dependency org="net.java.dev.jets3t" name="jets3t" rev="0.8.1"/>
|
||||
<!-- Dependencies for LSF, DRMAA, and other C libraries -->
|
||||
<dependency org="net.java.dev.jna" name="jna" rev="3.2.7"/>
|
||||
|
||||
<!-- Dependencies for GridEngine -->
|
||||
<dependency org="net.sf.gridscheduler" name="drmaa" rev="latest.integration"/>
|
||||
<!-- Dependencies for amazon.com S3 support -->
|
||||
<dependency org="net.java.dev.jets3t" name="jets3t" rev="0.8.1"/>
|
||||
|
||||
<!-- Scala dependancies -->
|
||||
<dependency org="org.scala-lang" name="scala-compiler" rev="2.8.1"/>
|
||||
<dependency org="org.scala-lang" name="scala-library" rev="2.8.1"/>
|
||||
<!-- Dependencies for GridEngine -->
|
||||
<dependency org="net.sf.gridscheduler" name="drmaa" rev="latest.integration"/>
|
||||
|
||||
<!-- testing and evaluation dependencies -->
|
||||
<dependency org="org.testng" name="testng" rev="5.14.1" conf="test" />
|
||||
<dependency org="net.sourceforge.findbugs" name="findbugs" rev="1.3.2" conf="test"/>
|
||||
<dependency org="net.sourceforge.findbugs" name="findbugs-ant" rev="1.3.2" conf="test"/>
|
||||
<dependency org="net.sourceforge.findbugs" name="annotations" rev="1.3.2" conf="test"/>
|
||||
<dependency org="net.sourceforge.findbugs" name="jsr305" rev="1.3.2" conf="test"/>
|
||||
<dependency org="com.google.code.caliper" name="caliper" rev="1.0-SNAPSHOT" conf="test" />
|
||||
<!-- Scala dependancies -->
|
||||
<dependency org="org.scala-lang" name="scala-compiler" rev="2.8.1"/>
|
||||
<dependency org="org.scala-lang" name="scala-library" rev="2.8.1"/>
|
||||
|
||||
<!-- Contracts for Java and dependencies -->
|
||||
<dependency org="com.google.code.cofoja" name="cofoja" rev="1.0-20110609" />
|
||||
<dependency org="asm" name="asm-all" rev="3.3.1" />
|
||||
<!-- testing and evaluation dependencies -->
|
||||
<dependency org="org.testng" name="testng" rev="5.14.1" conf="test"/>
|
||||
<dependency org="org.uncommons" name="reportng" rev="1.1.2" conf="test"/>
|
||||
<dependency org="com.google.code.caliper" name="caliper" rev="1.0-SNAPSHOT" conf="test"/>
|
||||
|
||||
<!-- POI, for reading pipeline files -->
|
||||
<dependency org="org.apache.poi" name="poi" rev="3.8-beta3" />
|
||||
<dependency org="org.apache.poi" name="poi-ooxml" rev="3.8-beta3" />
|
||||
<!-- Contracts for Java and dependencies -->
|
||||
<dependency org="com.google.code.cofoja" name="cofoja" rev="1.0-20110609"/>
|
||||
<dependency org="asm" name="asm-all" rev="3.3.1"/>
|
||||
|
||||
<!-- snpEff annotator for pipelines -->
|
||||
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.5" />
|
||||
<!-- POI, for reading pipeline files -->
|
||||
<dependency org="org.apache.poi" name="poi" rev="3.8-beta3"/>
|
||||
<dependency org="org.apache.poi" name="poi-ooxml" rev="3.8-beta3"/>
|
||||
|
||||
<!-- Exclude dependencies on sun libraries where the downloads aren't available but included in the jvm. -->
|
||||
<exclude org="javax.servlet" />
|
||||
<exclude org="javax.jms" />
|
||||
<exclude org="com.sun.*" />
|
||||
</dependencies>
|
||||
<!-- snpEff annotator for pipelines -->
|
||||
<dependency org="net.sf.snpeff" name="snpeff" rev="2.0.5"/>
|
||||
|
||||
<!-- MongoDB for the GXDB project -->
|
||||
<dependency org="org.mongodb" name="mongo-java-driver" rev="2.7.3"/>
|
||||
|
||||
<!-- GSON and HTTP for talking to the REST API on Vanilla Forums -->
|
||||
<dependency org="com.google.code.gson" name="gson" rev="2.2.2"/>
|
||||
<dependency org="org.apache.httpcomponents" name="httpclient" rev="4.1.1"/>
|
||||
|
||||
<!-- Exclude dependencies on sun libraries where the downloads aren't available but included in the jvm. -->
|
||||
<exclude org="javax.servlet"/>
|
||||
<exclude org="javax.jms"/>
|
||||
<exclude org="com.sun.*"/>
|
||||
</dependencies>
|
||||
</ivy-module>
|
||||
|
|
|
|||
Binary file not shown.
|
|
@ -1,4 +1,4 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.recalibration;
|
||||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
|
|
@ -25,31 +25,10 @@ package org.broadinstitute.sting.gatk.walkers.recalibration;
|
|||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: carneiro
|
||||
* Date: Mar 22, 2012
|
||||
*
|
||||
* Object that holds the empirical quality and estimated reported quality values for on-the-fly recalibration. This is a simplification of the RecalDatum object
|
||||
*/
|
||||
import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource;
|
||||
|
||||
public class EmpiricalQual {
|
||||
public class DummyProtectedClass implements ProtectedPackageSource {
|
||||
|
||||
private double estimatedQReported; // estimated reported quality score based on combined data's individual q-reporteds and number of observations
|
||||
private double empiricalQuality; // the empirical quality for datums that have been collapsed together (by read group and reported quality, for example)
|
||||
|
||||
private EmpiricalQual() {}
|
||||
|
||||
public EmpiricalQual(final double estimatedQReported, final double empiricalQuality) {
|
||||
this.estimatedQReported = estimatedQReported;
|
||||
this.empiricalQuality = empiricalQuality;
|
||||
}
|
||||
|
||||
public final double getEstimatedQReported() {
|
||||
return estimatedQReported;
|
||||
}
|
||||
|
||||
public final double getEmpiricalQuality() {
|
||||
return empiricalQuality;
|
||||
}
|
||||
}
|
||||
// THIS CLASS IS USED JUST SO THAT WE CAN TEST WHETHER WE ARE USING THE LITE OR FULL VERSION OF THE GATK
|
||||
// **** DO NOT REMOVE! ****
|
||||
}
|
||||
|
|
@ -0,0 +1,99 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource;
|
||||
import org.broadinstitute.sting.utils.collections.NestedIntegerArray;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
|
||||
|
||||
public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource {
|
||||
|
||||
// optimizations: don't reallocate an array each time
|
||||
private byte[] tempQualArray;
|
||||
private boolean[] tempErrorArray;
|
||||
|
||||
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) {
|
||||
super.initialize(covariates, recalibrationTables);
|
||||
tempQualArray = new byte[EventType.values().length];
|
||||
tempErrorArray = new boolean[EventType.values().length];
|
||||
}
|
||||
|
||||
/**
|
||||
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference
|
||||
* Using the list of covariate values as a key, pick out the RecalDatum and increment,
|
||||
* adding one to the number of observations and potentially one to the number of mismatches for all three
|
||||
* categories (mismatches, insertions and deletions).
|
||||
*
|
||||
* @param pileupElement The pileup element to update
|
||||
* @param refBase The reference base at this locus
|
||||
*/
|
||||
public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) {
|
||||
final int offset = pileupElement.getOffset();
|
||||
final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead());
|
||||
|
||||
tempQualArray[EventType.BASE_SUBSTITUTION.index] = pileupElement.getQual();
|
||||
tempErrorArray[EventType.BASE_SUBSTITUTION.index] = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase);
|
||||
tempQualArray[EventType.BASE_INSERTION.index] = pileupElement.getBaseInsertionQual();
|
||||
tempErrorArray[EventType.BASE_INSERTION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? pileupElement.isAfterInsertion() : pileupElement.isBeforeInsertion();
|
||||
tempQualArray[EventType.BASE_DELETION.index] = pileupElement.getBaseDeletionQual();
|
||||
tempErrorArray[EventType.BASE_DELETION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? pileupElement.isAfterDeletedBase() : pileupElement.isBeforeDeletedBase();
|
||||
|
||||
for (final EventType eventType : EventType.values()) {
|
||||
final int[] keys = readCovariates.getKeySet(offset, eventType);
|
||||
final int eventIndex = eventType.index;
|
||||
final byte qual = tempQualArray[eventIndex];
|
||||
final boolean isError = tempErrorArray[eventIndex];
|
||||
|
||||
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE);
|
||||
final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex);
|
||||
final RecalDatum rgThisDatum = createDatumObject(qual, isError);
|
||||
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it
|
||||
rgRecalTable.put(rgThisDatum, keys[0], eventIndex);
|
||||
else
|
||||
rgPreviousDatum.combine(rgThisDatum);
|
||||
|
||||
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE);
|
||||
final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex);
|
||||
if (qualPreviousDatum == null)
|
||||
qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex);
|
||||
else
|
||||
qualPreviousDatum.increment(isError);
|
||||
|
||||
for (int i = 2; i < covariates.length; i++) {
|
||||
if (keys[i] < 0)
|
||||
continue;
|
||||
final NestedIntegerArray<RecalDatum> covRecalTable = recalibrationTables.getTable(i);
|
||||
final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex);
|
||||
if (covPreviousDatum == null)
|
||||
covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex);
|
||||
else
|
||||
covPreviousDatum.increment(isError);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base.
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 6/15/12
|
||||
*/
|
||||
public class BaseAndQualsCounts extends BaseCounts {
|
||||
private final Map<BaseIndex, Long> sumInsertionQuals;
|
||||
private final Map<BaseIndex, Long> sumDeletionQuals;
|
||||
|
||||
public BaseAndQualsCounts() {
|
||||
super();
|
||||
this.sumInsertionQuals = new HashMap<BaseIndex, Long>();
|
||||
this.sumDeletionQuals = new HashMap<BaseIndex, Long>();
|
||||
for (BaseIndex i : BaseIndex.values()) {
|
||||
sumInsertionQuals.put(i, 0L);
|
||||
sumDeletionQuals.put(i, 0L);
|
||||
}
|
||||
}
|
||||
|
||||
public void incr(byte base, byte baseQual, byte insQual, byte delQual) {
|
||||
super.incr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
sumInsertionQuals.put(i, sumInsertionQuals.get(i) + insQual);
|
||||
sumDeletionQuals.put(i, sumDeletionQuals.get(i) + delQual);
|
||||
}
|
||||
}
|
||||
|
||||
public void decr(byte base, byte baseQual, byte insQual, byte delQual) {
|
||||
super.decr(base, baseQual);
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // do not allow Ns
|
||||
sumInsertionQuals.put(i, sumInsertionQuals.get(i) - insQual);
|
||||
sumDeletionQuals.put(i, sumDeletionQuals.get(i) - delQual);
|
||||
}
|
||||
}
|
||||
|
||||
public byte averageInsertionQualsOfMostCommonBase() {
|
||||
return getGenericAverageQualOfMostCommonBase(sumInsertionQuals);
|
||||
}
|
||||
|
||||
public byte averageDeletionQualsOfMostCommonBase() {
|
||||
return getGenericAverageQualOfMostCommonBase(sumDeletionQuals);
|
||||
}
|
||||
|
||||
private byte getGenericAverageQualOfMostCommonBase(Map<BaseIndex, Long> sumQuals) {
|
||||
BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts());
|
||||
return (byte) (sumQuals.get(base) / getCount(base));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,223 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
|
||||
import java.util.EnumMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* An object to keep track of the number of occurences of each base and it's quality.
|
||||
*
|
||||
* User: depristo
|
||||
* Date: 4/8/11
|
||||
* Time: 2:55 PM
|
||||
*/
|
||||
|
||||
public class BaseCounts {
|
||||
public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N;
|
||||
public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte();
|
||||
|
||||
private final Map<BaseIndex, Integer> counts; // keeps track of the base counts
|
||||
private final Map<BaseIndex, Long> sumQuals; // keeps track of the quals of each base
|
||||
|
||||
public BaseCounts() {
|
||||
counts = new EnumMap<BaseIndex, Integer>(BaseIndex.class);
|
||||
sumQuals = new EnumMap<BaseIndex, Long>(BaseIndex.class);
|
||||
for (BaseIndex i : BaseIndex.values()) {
|
||||
counts.put(i, 0);
|
||||
sumQuals.put(i, 0L);
|
||||
}
|
||||
}
|
||||
|
||||
public static BaseCounts createWithCounts(int[] countsACGT) {
|
||||
BaseCounts baseCounts = new BaseCounts();
|
||||
baseCounts.counts.put(BaseIndex.A, countsACGT[0]);
|
||||
baseCounts.counts.put(BaseIndex.C, countsACGT[1]);
|
||||
baseCounts.counts.put(BaseIndex.G, countsACGT[2]);
|
||||
baseCounts.counts.put(BaseIndex.T, countsACGT[3]);
|
||||
return baseCounts;
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void add(BaseCounts other) {
|
||||
for (BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) + other.counts.get(i));
|
||||
}
|
||||
|
||||
@Requires("other != null")
|
||||
public void sub(BaseCounts other) {
|
||||
for (BaseIndex i : BaseIndex.values())
|
||||
counts.put(i, counts.get(i) - other.counts.get(i));
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
|
||||
public void incr(byte base, byte qual) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) + 1);
|
||||
sumQuals.put(i, sumQuals.get(i) + qual);
|
||||
}
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
}
|
||||
|
||||
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
|
||||
public void decr(byte base, byte qual) {
|
||||
BaseIndex i = BaseIndex.byteToBase(base);
|
||||
if (i != null) { // no Ns
|
||||
counts.put(i, counts.get(i) - 1);
|
||||
sumQuals.put(i, sumQuals.get(i) - qual);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(byte base) {
|
||||
return getCount(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int getCount(BaseIndex base) {
|
||||
return counts.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(byte base) {
|
||||
return getSumQuals(BaseIndex.byteToBase(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long getSumQuals(BaseIndex base) {
|
||||
return sumQuals.get(base);
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(byte base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQuals(BaseIndex base) {
|
||||
return (byte) (getSumQuals(base) / getCount(base));
|
||||
}
|
||||
|
||||
public byte baseWithMostCounts() {
|
||||
return baseIndexWithMostCounts().getByte();
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int countOfMostCommonBase() {
|
||||
return counts.get(baseIndexWithMostCounts());
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public long sumQualsOfMostCommonBase() {
|
||||
return sumQuals.get(baseIndexWithMostCounts());
|
||||
}
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public byte averageQualsOfMostCommonBase() {
|
||||
return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase());
|
||||
}
|
||||
|
||||
|
||||
@Ensures("result >= 0")
|
||||
public int totalCount() {
|
||||
int sum = 0;
|
||||
for (int c : counts.values())
|
||||
sum += c;
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a base , it returns the proportional count of this base compared to all other bases
|
||||
*
|
||||
* @param base
|
||||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(byte base) {
|
||||
return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount();
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a base , it returns the proportional count of this base compared to all other bases
|
||||
*
|
||||
* @param baseIndex
|
||||
* @return the proportion of this base over all other bases
|
||||
*/
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportion(BaseIndex baseIndex) {
|
||||
int total = totalCount();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
return (double) counts.get(baseIndex) / totalCount();
|
||||
}
|
||||
|
||||
|
||||
@Ensures("result != null")
|
||||
public String toString() {
|
||||
StringBuilder b = new StringBuilder();
|
||||
for (Map.Entry<BaseIndex, Integer> elt : counts.entrySet()) {
|
||||
b.append(elt.toString()).append("=").append(elt.getValue()).append(",");
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCounts() {
|
||||
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (BaseIndex i : counts.keySet())
|
||||
if (counts.get(i) > counts.get(maxI))
|
||||
maxI = i;
|
||||
return maxI;
|
||||
}
|
||||
|
||||
@Ensures("result != null")
|
||||
public BaseIndex baseIndexWithMostCountsWithoutIndels() {
|
||||
BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS;
|
||||
for (BaseIndex index : counts.keySet())
|
||||
if (index.isNucleotide() && counts.get(index) > counts.get(mostCounts))
|
||||
mostCounts = index;
|
||||
return mostCounts;
|
||||
}
|
||||
|
||||
@Ensures("result >=0")
|
||||
public int totalCountWithoutIndels() {
|
||||
int sum = 0;
|
||||
for (BaseIndex index : counts.keySet())
|
||||
if (index.isNucleotide())
|
||||
sum += counts.get(index);
|
||||
return sum;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the proportional count of a base compared to all other bases except indels (I and D)
|
||||
*
|
||||
* @param index
|
||||
* @return the proportion of this base over all other bases except indels
|
||||
*/
|
||||
@Requires("index.isNucleotide()")
|
||||
@Ensures({"result >=0.0", "result<= 1.0"})
|
||||
public double baseCountProportionWithoutIndels(BaseIndex index) {
|
||||
int total = totalCountWithoutIndels();
|
||||
if (total == 0)
|
||||
return 0.0;
|
||||
return (double) counts.get(index) / totalCountWithoutIndels();
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,82 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
/**
|
||||
* Simple byte / base index conversions
|
||||
*
|
||||
*
|
||||
* @author carneiro
|
||||
* @since 8/26/11
|
||||
*/
|
||||
public enum BaseIndex {
|
||||
A ( 'A', 0 ),
|
||||
C ( 'C', 1 ),
|
||||
G ( 'G', 2 ),
|
||||
T ( 'T', 3 ),
|
||||
D ( 'D', 4 ),
|
||||
I ( 'I', 5 ), // insertion to the right of the base
|
||||
N ( 'N', 6 );
|
||||
|
||||
final byte b;
|
||||
final int index;
|
||||
|
||||
public byte getByte() { return b; }
|
||||
|
||||
private BaseIndex(char base, int index) {
|
||||
this.b = (byte)base;
|
||||
this.index = index;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a byte representation of a base to BaseIndex
|
||||
*
|
||||
* @param base the byte representation of the base
|
||||
* @return the BaseIndex representation of the base;
|
||||
*/
|
||||
public static BaseIndex byteToBase(final byte base) {
|
||||
switch (base) {
|
||||
case 'A':
|
||||
case 'a':
|
||||
return A;
|
||||
case 'C':
|
||||
case 'c':
|
||||
return C;
|
||||
case 'G':
|
||||
case 'g':
|
||||
return G;
|
||||
case 'T':
|
||||
case 't':
|
||||
return T;
|
||||
case 'D':
|
||||
case 'd':
|
||||
case '-':
|
||||
return D;
|
||||
case 'I':
|
||||
case 'i':
|
||||
return I;
|
||||
case 'N':
|
||||
case 'n':
|
||||
return N;
|
||||
default: return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Definition of a nucleotide for the BaseIndex is anything that has been read as a base
|
||||
* by the machine (A,C,G,T), even if it couldn't tell which base it was, but it knows
|
||||
* there is a base there (N).
|
||||
*
|
||||
* @return whether or not it is a nucleotide, given the definition above
|
||||
*/
|
||||
public boolean isNucleotide() {
|
||||
return this == A || this == C || this == G || this == T || this == N;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not this base is an insertion or a deletion
|
||||
*
|
||||
* @return true for I or D, false otherwise
|
||||
*/
|
||||
public boolean isIndel() {
|
||||
return this == D || this == I;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,182 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.DuplicateReadFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.FailsVendorQualityCheckFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.NotPrimaryAlignmentFilter;
|
||||
import org.broadinstitute.sting.gatk.filters.UnmappedReadFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Given two BAMs with different read groups, it compares them based on ReduceReads metrics.
|
||||
* <p>
|
||||
* This is a test walker used for asserting that the ReduceReads procedure is not making blatant mistakes when compressing bam files.
|
||||
* </p>
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Two BAM files (using -I) with different read group IDs
|
||||
* </p>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* [Output description]
|
||||
* </p>
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java
|
||||
* -jar GenomeAnalysisTK.jar
|
||||
* -T $WalkerName
|
||||
* </pre>
|
||||
*
|
||||
* @author carneiro
|
||||
* @since 10/30/11
|
||||
*/
|
||||
|
||||
@ReadFilters({UnmappedReadFilter.class,NotPrimaryAlignmentFilter.class,DuplicateReadFilter.class,FailsVendorQualityCheckFilter.class})
|
||||
public class CompareBAM extends LocusWalker<Map<CompareBAM.TestName, Boolean>, CompareBAM.TestResults> {
|
||||
@Argument(required = true, shortName = "rr", fullName = "reduced_readgroup", doc = "The read group ID corresponding to the compressed BAM being tested") public String reducedReadGroupID;
|
||||
@Argument(required = false, shortName = "teq", fullName = "test_equal_bases", doc = "Test if the bases marked as '=' are indeed ref bases.") public boolean TEST_EQUAL_BASES = false;
|
||||
@Argument(required = false, shortName = "tbc", fullName = "test_base_counts", doc = "Test if the base counts tag in consensus reads are accurate.") public boolean TEST_BASE_COUNTS = false;
|
||||
@Argument(required = false, shortName = "mbq", fullName = "min_base_qual", doc = "Minimum base quality to be considered.") public int MIN_BASE_QUAL = 20;
|
||||
@Argument(required = false, shortName = "mmq", fullName = "min_mapping_qual", doc = "Minimum mapping quality to be considered.") public int MIN_MAPPING_QUAL = 20;
|
||||
|
||||
|
||||
@Override
|
||||
public Map<TestName, Boolean> map (RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
Map<TestName, Boolean> result = new HashMap<TestName, Boolean>();
|
||||
|
||||
if (TEST_EQUAL_BASES) result.put(TestName.EQUAL_BASES, testEqualBases(ref, context));
|
||||
if (TEST_BASE_COUNTS) result.put(TestName.BASE_COUNTS, testBaseCounts(ref, context));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TestResults reduceInit () {
|
||||
TestResults sum = new TestResults(); // a fresh new TestResults object to sum up the results of every object passed by MAP.
|
||||
|
||||
if (TEST_EQUAL_BASES) sum.createTest(TestName.EQUAL_BASES);
|
||||
if (TEST_BASE_COUNTS) sum.createTest(TestName.BASE_COUNTS);
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TestResults reduce (Map<TestName,Boolean> mapResult, TestResults sum) {
|
||||
for (TestName test : mapResult.keySet()) {
|
||||
if (mapResult.get(test))
|
||||
sum.reportSuccess(test);
|
||||
else
|
||||
sum.reportFailed(test);
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
public void onTraversalDone (TestResults finalResults) {
|
||||
finalResults.report();
|
||||
}
|
||||
|
||||
private boolean testEqualBases (ReferenceContext ref, AlignmentContext context) {
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean testBaseCounts (ReferenceContext ref, AlignmentContext context) {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public enum TestName {
|
||||
EQUAL_BASES ("testEqualBases"),
|
||||
BASE_COUNTS ("testBaseCounts");
|
||||
|
||||
private String testName;
|
||||
|
||||
TestName(String testName) {
|
||||
this.testName = testName;
|
||||
}
|
||||
|
||||
public String getTestName() {
|
||||
return testName;
|
||||
}
|
||||
}
|
||||
|
||||
public class TestResults {
|
||||
private Map<TestName, TestOutcome> testStats = new HashMap<TestName, TestOutcome>();
|
||||
|
||||
public void createTest (TestName test) {
|
||||
testStats.put(test, new TestOutcome());
|
||||
}
|
||||
|
||||
public void reportSuccess(TestName test) {
|
||||
if (testStats.containsKey(test))
|
||||
testStats.get(test).incPassed();
|
||||
else
|
||||
throw new ReviewedStingException("No such test: " + test);
|
||||
}
|
||||
|
||||
public void reportFailed(TestName test) {
|
||||
if (testStats.containsKey(test))
|
||||
testStats.get(test).incFailed();
|
||||
else
|
||||
throw new ReviewedStingException("No such test: " + test);
|
||||
}
|
||||
|
||||
public void report() {
|
||||
System.out.println();
|
||||
System.out.println(String.format("%20s\tPASS\tFAIL", ""));
|
||||
for (TestName test : testStats.keySet())
|
||||
System.out.println(String.format("%20s\t%d\t%d", test.getTestName(), testStats.get(test).getPassed(), testStats.get(test).getFailed()));
|
||||
System.out.println();
|
||||
}
|
||||
}
|
||||
|
||||
private class TestOutcome {
|
||||
private long passed;
|
||||
private long failed;
|
||||
|
||||
public long getPassed() {
|
||||
return passed;
|
||||
}
|
||||
|
||||
public void incPassed() {
|
||||
this.passed++;
|
||||
}
|
||||
|
||||
public long getFailed() {
|
||||
return failed;
|
||||
}
|
||||
|
||||
public void incFailed() {
|
||||
this.failed++;
|
||||
}
|
||||
}
|
||||
|
||||
private BaseCounts getFilteredBaseCounts(AlignmentContext context) {
|
||||
return getBaseCounts(context, MIN_BASE_QUAL, MIN_MAPPING_QUAL);
|
||||
}
|
||||
|
||||
private BaseCounts getFullBaseCounts(AlignmentContext context) {
|
||||
return getBaseCounts(context, 3, 0);
|
||||
}
|
||||
|
||||
private BaseCounts getBaseCounts(AlignmentContext context, int mbq, int mmq) {
|
||||
BaseCounts fullBaseCounts = new BaseCounts();
|
||||
for (String rg : context.getBasePileup().getReadGroups()) {
|
||||
if (!rg.equals(reducedReadGroupID)) {
|
||||
BaseCounts b = BaseCounts.createWithCounts(context.getBasePileup().getPileupForReadGroup(rg).getBaseAndMappingFilteredPileup(mbq, mmq).getBaseCounts());
|
||||
fullBaseCounts.add(b);
|
||||
}
|
||||
}
|
||||
return fullBaseCounts;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: depristo
|
||||
* Date: 4/10/11
|
||||
* Time: 8:49 AM
|
||||
*
|
||||
* A general interface for ReadCompressors. Read compressors have the following semantics:
|
||||
*
|
||||
* The accept a stream of reads, in order, and after each added read returns a compressed stream
|
||||
* of reads for emission. This stream of reads is a "reduced" representation of the total stream
|
||||
* of reads. The actual compression approach is left up to the implementing class.
|
||||
*/
|
||||
public interface Compressor {
    /**
     * Adds the read to the compressor. The returned iterable collection of
     * reads represents the incremental compressed output: reads that have become
     * final as a consequence of adding this one.
     *
     * @param read the next uncompressed read in the input stream to the compressor
     * @return an iterator over the incrementally available compressed reads
     */
    @Requires("read != null")
    @Ensures("result != null")
    Iterable<GATKSAMRecord> addAlignment(GATKSAMRecord read);

    /**
     * Must be called after the last read has been added to finalize the compressor state
     * and return the last compressed reads from the compressor.
     *
     * @return an iterator over the final compressed reads of this compressor
     */
    @Ensures("result != null")
    Iterable<GATKSAMRecord> close();
}
|
||||
|
|
@ -0,0 +1,204 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
* The element that describes the header of the sliding window.
|
||||
*
|
||||
* Each site has a header element containing the counts of each base, it's reference based location and whether or
|
||||
* not the site has insertions (to it's right). It also contains information about the bases that have been filtered
|
||||
* out due to mapping or base quality.
|
||||
*/
|
||||
public class HeaderElement {
|
||||
private BaseAndQualsCounts consensusBaseCounts; // How many A,C,G,T (and D's) are in this site.
|
||||
private BaseAndQualsCounts filteredBaseCounts; // How many A,C,G,T (and D's) were filtered out in this site.
|
||||
private int insertionsToTheRight; // How many reads in this site had insertions to the immediate right
|
||||
private int nSoftClippedBases; // How many bases in this site came from soft clipped bases
|
||||
private int location; // Genome location of this site (the sliding window knows which contig we're at
|
||||
private LinkedList<Integer> mappingQuality; // keeps the mapping quality of each read that contributed to this element (site)
|
||||
|
||||
public int getLocation() {
|
||||
return location;
|
||||
}
|
||||
|
||||
public BaseAndQualsCounts getFilteredBaseCounts() {
|
||||
return filteredBaseCounts;
|
||||
}
|
||||
|
||||
public BaseAndQualsCounts getConsensusBaseCounts() {
|
||||
return consensusBaseCounts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new HeaderElement with the following default values: - empty consensusBaseCounts - empty
|
||||
* filteredBaseCounts - 0 insertions to the right - empty mappingQuality list
|
||||
*
|
||||
* @param location the reference location for the new element
|
||||
*/
|
||||
public HeaderElement(int location) {
|
||||
this(new BaseAndQualsCounts(), new BaseAndQualsCounts(), 0, 0, location, new LinkedList<Integer>());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new HeaderElement with all given parameters
|
||||
*
|
||||
* @param consensusBaseCounts the BaseCounts object for the running consensus synthetic read
|
||||
* @param filteredBaseCounts the BaseCounts object for the filtered data synthetic read
|
||||
* @param insertionsToTheRight number of insertions to the right of this HeaderElement
|
||||
* @param location the reference location of this reference element
|
||||
* @param mappingQuality the list of mapping quality values of all reads that contributed to this
|
||||
* HeaderElement
|
||||
*/
|
||||
public HeaderElement(BaseAndQualsCounts consensusBaseCounts, BaseAndQualsCounts filteredBaseCounts, int insertionsToTheRight, int nSoftClippedBases, int location, LinkedList<Integer> mappingQuality) {
|
||||
this.consensusBaseCounts = consensusBaseCounts;
|
||||
this.filteredBaseCounts = filteredBaseCounts;
|
||||
this.insertionsToTheRight = insertionsToTheRight;
|
||||
this.nSoftClippedBases = nSoftClippedBases;
|
||||
this.location = location;
|
||||
this.mappingQuality = mappingQuality;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not the site represented by this HeaderElement is variant according to the definitions of variant
|
||||
* by insertion, deletion and mismatches.
|
||||
*
|
||||
* @return true if site is variant by any definition. False otherwise.
|
||||
*/
|
||||
public boolean isVariant(double minVariantProportion, double minIndelProportion) {
|
||||
return hasConsensusData() && (isVariantFromInsertions(minIndelProportion) || isVariantFromMismatches(minVariantProportion) || isVariantFromDeletions(minIndelProportion) || isVariantFromSoftClips());
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds a new base to the HeaderElement updating all counts accordingly
|
||||
*
|
||||
* @param base the base to add
|
||||
* @param baseQual the base quality
|
||||
* @param baseMappingQuality the mapping quality of the read this base belongs to
|
||||
*/
|
||||
public void addBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
|
||||
if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
|
||||
consensusBaseCounts.incr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
|
||||
else
|
||||
filteredBaseCounts.incr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts
|
||||
|
||||
this.mappingQuality.add(baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site
|
||||
nSoftClippedBases += isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter
|
||||
}
|
||||
|
||||
public void removeBase(byte base, byte baseQual, byte insQual, byte delQual, int baseMappingQuality, int minBaseQual, int minMappingQual, boolean isSoftClipped) {
|
||||
if (basePassesFilters(baseQual, minBaseQual, baseMappingQuality, minMappingQual))
|
||||
consensusBaseCounts.decr(base, baseQual, insQual, delQual); // If the base passes filters, it is included in the consensus base counts
|
||||
else
|
||||
filteredBaseCounts.decr(base, baseQual, insQual, delQual); // If the base fails filters, it is included with the filtered data base counts
|
||||
|
||||
this.mappingQuality.remove((Integer) baseMappingQuality); // Filtered or not, the RMS mapping quality includes all bases in this site
|
||||
nSoftClippedBases -= isSoftClipped ? 1 : 0; // if this base is softclipped, add the counter
|
||||
}
|
||||
/**
|
||||
* Adds an insertions to the right of the HeaderElement and updates all counts accordingly. All insertions
|
||||
* should be added to the right of the element.
|
||||
*/
|
||||
public void addInsertionToTheRight() {
|
||||
insertionsToTheRight++;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this HeaderElement contain consensus data?
|
||||
*
|
||||
* @return whether or not this HeaderElement contains consensus data
|
||||
*/
|
||||
public boolean hasConsensusData() {
|
||||
return consensusBaseCounts.totalCount() > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Does this HeaderElement contain filtered data?
|
||||
*
|
||||
* @return whether or not this HeaderElement contains filtered data
|
||||
*/
|
||||
public boolean hasFilteredData() {
|
||||
return filteredBaseCounts.totalCount() > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* A HeaderElement is empty if it has no consensus or filtered data
|
||||
*
|
||||
* @return whether or not this HeaderElement has no data
|
||||
*/
|
||||
public boolean isEmpty() {
|
||||
return (!hasFilteredData() && !hasConsensusData());
|
||||
}
|
||||
|
||||
/**
|
||||
* The RMS of the mapping qualities of all reads that contributed to this HeaderElement
|
||||
*
|
||||
* @return the RMS of the mapping qualities of all reads that contributed to this HeaderElement
|
||||
*/
|
||||
public double getRMS() {
|
||||
return MathUtils.rms(mappingQuality);
|
||||
}
|
||||
|
||||
/**
|
||||
* removes an insertion from this element (if you removed a read that had an insertion)
|
||||
*/
|
||||
public void removeInsertionToTheRight() {
|
||||
this.insertionsToTheRight--;
|
||||
if (insertionsToTheRight < 0)
|
||||
throw new ReviewedStingException("Removed too many insertions, header is now negative!");
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not the HeaderElement is variant due to excess insertions
|
||||
*
|
||||
* @return whether or not the HeaderElement is variant due to excess insertions
|
||||
*/
|
||||
private boolean isVariantFromInsertions(double minIndelProportion) {
|
||||
int numberOfBases = consensusBaseCounts.totalCount();
|
||||
if (numberOfBases == 0 && insertionsToTheRight > 0)
|
||||
return true; // we only have insertions
|
||||
else if (numberOfBases == 0)
|
||||
return false; // we don't have anything
|
||||
|
||||
// if we have bases and insertions, check the ratio
|
||||
return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not the HeaderElement is variant due to excess deletions
|
||||
*
|
||||
* @return whether or not the HeaderElement is variant due to excess insertions
|
||||
*/
|
||||
private boolean isVariantFromDeletions(double minIndelProportion) {
|
||||
return consensusBaseCounts.baseIndexWithMostCounts() == BaseIndex.D || consensusBaseCounts.baseCountProportion(BaseIndex.D) > minIndelProportion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether or not the HeaderElement is variant due to excess mismatches
|
||||
*
|
||||
* @return whether or not the HeaderElement is variant due to excess insertions
|
||||
*/
|
||||
private boolean isVariantFromMismatches(double minVariantProportion) {
|
||||
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels();
|
||||
double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
|
||||
return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
|
||||
}
|
||||
|
||||
/**
|
||||
* This handles the special case where we have more bases that came from soft clips than bases that came from
|
||||
* normal bases by forcing it to become a variant region. We don't want a consensus based on too little information.
|
||||
*
|
||||
* @return true if we had more soft clipped bases contributing to this site than matches/mismatches.
|
||||
*/
|
||||
private boolean isVariantFromSoftClips() {
|
||||
return nSoftClippedBases >= (consensusBaseCounts.totalCount() - nSoftClippedBases);
|
||||
}
|
||||
|
||||
private boolean basePassesFilters(byte baseQual, int minBaseQual, int baseMappingQuality, int minMappingQual) {
|
||||
return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/*
|
||||
* Copyright (c) 2009 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
*
|
||||
* @author depristo
|
||||
*/
|
||||
public class MultiSampleCompressor implements Compressor {
|
||||
protected static final Logger logger = Logger.getLogger(MultiSampleCompressor.class);
|
||||
|
||||
protected Map<String, SingleSampleCompressor> compressorsPerSample = new HashMap<String, SingleSampleCompressor>();
|
||||
|
||||
public MultiSampleCompressor(SAMFileHeader header,
|
||||
final int contextSize,
|
||||
final int downsampleCoverage,
|
||||
final int minMappingQuality,
|
||||
final double minAltProportionToTriggerVariant,
|
||||
final double minIndelProportionToTriggerVariant,
|
||||
final int minBaseQual,
|
||||
final ReduceReads.DownsampleStrategy downsampleStrategy) {
|
||||
for ( String name : SampleUtils.getSAMFileSamples(header) ) {
|
||||
compressorsPerSample.put(name,
|
||||
new SingleSampleCompressor(name, contextSize, downsampleCoverage,
|
||||
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterable<GATKSAMRecord> addAlignment(GATKSAMRecord read) {
|
||||
String sample = read.getReadGroup().getSample();
|
||||
SingleSampleCompressor compressor = compressorsPerSample.get(sample);
|
||||
if ( compressor == null )
|
||||
throw new ReviewedStingException("No compressor for sample " + sample);
|
||||
return compressor.addAlignment(read);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterable<GATKSAMRecord> close() {
|
||||
SortedSet<GATKSAMRecord> reads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
for ( SingleSampleCompressor comp : compressorsPerSample.values() )
|
||||
for ( GATKSAMRecord read : comp.close() )
|
||||
reads.add(read);
|
||||
return reads;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,678 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.util.SequenceUtil;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.*;
|
||||
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionType;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadFilters;
|
||||
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocComparator;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Reduces the BAM file using read based compression that keeps only essential information for variant calling
|
||||
* <p/>
|
||||
* <p>
|
||||
* This walker will generated reduced versions of the BAM files that still follow the BAM spec
|
||||
* and contain all the information necessary for the GSA variant calling pipeline. Some options
|
||||
* allow you to tune in how much compression you want to achieve. The default values have been
|
||||
* shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the
|
||||
* savings in file size and performance of the downstream tools.
|
||||
* <p/>
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The BAM file to be compressed
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* The compressed (reduced) BAM file.
|
||||
* </p>
|
||||
* <p/>
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T ReduceReads \
|
||||
* -I myData.bam \
|
||||
* -o myData.reduced.bam
|
||||
* </pre>
|
||||
*/
|
||||
|
||||
@PartitionBy(PartitionType.INTERVAL)
|
||||
@ReadFilters({UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class, BadCigarFilter.class})
|
||||
public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {
|
||||
|
||||
@Output
|
||||
protected StingSAMFileWriter out;
|
||||
|
||||
/**
|
||||
* The number of bases to keep around mismatches (potential variation)
|
||||
*/
|
||||
@Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
|
||||
protected int contextSize = 10;
|
||||
|
||||
/**
|
||||
* The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
|
||||
* mapping quality below this threshold will not be counted towards consensus, but are still counted
|
||||
* towards variable regions.
|
||||
*/
|
||||
@Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
|
||||
protected int minMappingQuality = 20;
|
||||
|
||||
/**
|
||||
* The minimum base quality to be considered for the consensus synthetic read. Reads that have
|
||||
* base quality below this threshold will not be counted towards consensus, but are still counted
|
||||
* towards variable regions.
|
||||
*/
|
||||
@Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
|
||||
protected byte minBaseQual = 20;
|
||||
|
||||
/**
|
||||
* Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
|
||||
* lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
|
||||
*/
|
||||
@Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
|
||||
protected byte minTailQuality = 2;
|
||||
|
||||
/**
|
||||
* Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
|
||||
* and read group).
|
||||
*/
|
||||
@Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
|
||||
protected boolean DONT_SIMPLIFY_READS = false;
|
||||
|
||||
/**
|
||||
* Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
|
||||
* The program will behave correctly in those cases.
|
||||
*/
|
||||
@Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
|
||||
protected boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
|
||||
|
||||
/**
|
||||
* Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
|
||||
* quality.
|
||||
*/
|
||||
@Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
|
||||
protected boolean DONT_CLIP_LOW_QUAL_TAILS = false;
|
||||
|
||||
/**
|
||||
* Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
|
||||
* base left by the aligner and use the high quality soft clipped bases in it's traversal algorithm to identify variant
|
||||
* regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
|
||||
*/
|
||||
@Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
|
||||
protected boolean DONT_USE_SOFTCLIPPED_BASES = false;
|
||||
|
||||
/**
|
||||
* Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee
|
||||
* uniqueness and reads with similar name will still have similar compressed names. Note: If you scatter/gather
|
||||
* there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
|
||||
*/
|
||||
@Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
|
||||
protected boolean DONT_COMPRESS_READ_NAMES = false;
|
||||
|
||||
/**
|
||||
* Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval
|
||||
* border.
|
||||
*/
|
||||
@Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false)
|
||||
protected boolean HARD_CLIP_TO_INTERVAL = false;
|
||||
|
||||
/**
|
||||
* Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
|
||||
* considered consensus.
|
||||
*/
|
||||
@Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
|
||||
protected double minAltProportionToTriggerVariant = 0.05;
|
||||
|
||||
/**
|
||||
* Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
|
||||
* considered consensus.
|
||||
*/
|
||||
@Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
|
||||
protected double minIndelProportionToTriggerVariant = 0.05;
|
||||
|
||||
/**
|
||||
* Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this).
|
||||
* A value of 0 turns downsampling off.
|
||||
*/
|
||||
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
|
||||
protected int downsampleCoverage = 0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "", shortName = "dl", doc = "", required = false)
|
||||
protected int debugLevel = 0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "", shortName = "dr", doc = "", required = false)
|
||||
protected String debugRead = "";
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
|
||||
protected DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
|
||||
private boolean NO_PG_TAG = false;
|
||||
|
||||
public enum DownsampleStrategy {
|
||||
Normal,
|
||||
Adaptive
|
||||
}
|
||||
|
||||
protected int totalReads = 0;
|
||||
int nCompressedReads = 0;
|
||||
|
||||
HashMap<String, Long> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
|
||||
Long nextReadNumber = 1L; // The next number to use for the compressed read name.
|
||||
|
||||
SortedSet<GenomeLoc> intervalList;
|
||||
|
||||
private static final String PROGRAM_RECORD_NAME = "GATK ReduceReads"; // The name that will go in the @PG tag
|
||||
|
||||
/**
|
||||
* Basic generic initialization of the readNameHash and the intervalList. Output initialization
|
||||
* is done at the reduceInit method
|
||||
*/
|
||||
@Override
|
||||
public void initialize() {
|
||||
super.initialize();
|
||||
GenomeAnalysisEngine toolkit = getToolkit();
|
||||
readNameHash = new HashMap<String, Long>(); // prepare the read name hash to keep track of what reads have had their read names compressed
|
||||
intervalList = new TreeSet<GenomeLoc>(new GenomeLocComparator()); // get the interval list from the engine. If no interval list was provided, the walker will work in WGS mode
|
||||
|
||||
if (toolkit.getIntervals() != null)
|
||||
intervalList.addAll(toolkit.getIntervals());
|
||||
|
||||
if (!NO_PG_TAG)
|
||||
Utils.setupWriter(out, toolkit, false, true, this, PROGRAM_RECORD_NAME);
|
||||
else
|
||||
out.setPresorted(false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes in a read and prepares it for the SlidingWindow machinery by performing the
|
||||
* following optional clipping operations:
|
||||
* 1. Hard clip adaptor sequences
|
||||
* 2. Hard clip low quality tails
|
||||
* 3. Hard clip all remaining soft clipped bases
|
||||
* 4. Hard clip read to the intervals in the interval list (this step may produce multiple reads)
|
||||
*
|
||||
* @param ref default map parameter
|
||||
* @param read default map parameter
|
||||
* @param metaDataTracker default map parameter
|
||||
* @return a linked list with all the reads produced by the clipping operations
|
||||
*/
|
||||
@Override
|
||||
public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
|
||||
LinkedList<GATKSAMRecord> mappedReads;
|
||||
totalReads++;
|
||||
if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))
|
||||
System.out.println("Found debug read!");
|
||||
|
||||
if (debugLevel == 1)
|
||||
System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd());
|
||||
|
||||
// we write the actual alignment starts to their respectiv alignment shift tags in the temporary
|
||||
// attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file
|
||||
read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart());
|
||||
read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd());
|
||||
|
||||
if (!DONT_SIMPLIFY_READS)
|
||||
read.simplify(); // Clear all unnecessary attributes
|
||||
if (!DONT_CLIP_ADAPTOR_SEQUENCES)
|
||||
read = ReadClipper.hardClipAdaptorSequence(read); // Strip away adaptor sequences, if any.
|
||||
if (!DONT_CLIP_LOW_QUAL_TAILS)
|
||||
read = ReadClipper.hardClipLowQualEnds(read, minTailQuality); // Clip low quality tails
|
||||
if (!isWholeGenome()) {
|
||||
if (HARD_CLIP_TO_INTERVAL)
|
||||
mappedReads = hardClipReadToInterval(read); // Hard clip the remainder of the read to the desired interval
|
||||
else {
|
||||
mappedReads = new LinkedList<GATKSAMRecord>();
|
||||
mappedReads.add(read);
|
||||
}
|
||||
}
|
||||
else {
|
||||
mappedReads = new LinkedList<GATKSAMRecord>();
|
||||
if (!read.isEmpty())
|
||||
mappedReads.add(read);
|
||||
}
|
||||
|
||||
if (!mappedReads.isEmpty() && !DONT_USE_SOFTCLIPPED_BASES) {
|
||||
LinkedList<GATKSAMRecord> tempList = new LinkedList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord mRead : mappedReads) {
|
||||
GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualitySoftClips(mRead, minBaseQual);
|
||||
if (!clippedRead.isEmpty())
|
||||
tempList.add(clippedRead);
|
||||
}
|
||||
mappedReads = tempList;
|
||||
}
|
||||
|
||||
if (debugLevel == 1)
|
||||
for (GATKSAMRecord mappedRead : mappedReads)
|
||||
System.out.printf("MAPPED: %s %d %d\n", mappedRead.getCigar(), mappedRead.getAlignmentStart(), mappedRead.getAlignmentEnd());
|
||||
|
||||
return mappedReads;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the ReduceReadsStash that keeps track of all reads that are waiting to
|
||||
* enter the SlidingWindow machinery. The stash makes sure reads are served in order
|
||||
* even though map() may generate reads that are only supposed to enter the machinery
|
||||
* in the future.
|
||||
*
|
||||
* @return the empty stash
|
||||
*/
|
||||
@Override
|
||||
public ReduceReadsStash reduceInit() {
|
||||
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy));
|
||||
}
|
||||
|
||||
/**
 * Takes the list of reads produced by map(), adds them to the stash (which keeps them sorted) and process
 * all reads that come before the original read (the read that was passed to map) including the original
 * read. This is where we send reads, in order, to the SlidingWindow machinery.
 *
 * @param mappedReads the list of reads sent by map
 * @param stash the stash that keeps the reads in order for processing
 * @return the stash with all reads that have not been processed yet
 */
public ReduceReadsStash reduce(LinkedList<GATKSAMRecord> mappedReads, ReduceReadsStash stash) {
    if (debugLevel == 1)
        stash.print();

    boolean firstRead = true;
    for (GATKSAMRecord read : mappedReads) {
        // only the first element of the list can be the original read (isOriginalRead is always true in WGS mode)
        boolean originalRead = firstRead && isOriginalRead(mappedReads, read);

        if (read.getReadLength() == 0)
            throw new ReviewedStingException("Empty read sent to reduce, this should never happen! " + read.getReadName() + " -- " + read.getCigar() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());

        if (originalRead) {
            // everything stashed at or before the original read's position, plus the original read
            // itself, is now safe to push through the compressor in sorted order
            List<GATKSAMRecord> readsReady = new LinkedList<GATKSAMRecord>();
            readsReady.addAll(stash.getAllReadsBefore(read));
            readsReady.add(read);

            for (GATKSAMRecord readReady : readsReady) {
                if (debugLevel == 1)
                    System.out.println("REDUCE: " + readReady.getCigar() + " " + readReady.getAlignmentStart() + " " + readReady.getAlignmentEnd());

                for (GATKSAMRecord compressedRead : stash.compress(readReady))
                    outputRead(compressedRead);

            }
        } else
            stash.add(read); // clipped tails produced by map() may belong further downstream; stash them for later

        firstRead = false;
    }

    return stash;
}
|
||||
|
||||
/**
|
||||
* Now that now more reads will come, we process all the remaining reads in the stash, in order.
|
||||
*
|
||||
* @param stash the ReduceReadsStash with all unprocessed reads (from reduce)
|
||||
*/
|
||||
@Override
|
||||
public void onTraversalDone(ReduceReadsStash stash) {
|
||||
|
||||
// output any remaining reads in the compressor
|
||||
for (GATKSAMRecord read : stash.close())
|
||||
outputRead(read);
|
||||
}
|
||||
|
||||
/**
 * Hard clips away all parts of the read that don't agree with the intervals selected.
 *
 * Note: If the read overlaps more than one interval, it will be hard clipped to all
 * the intervals it overlaps with. The loop repeatedly clips off the portion matching
 * the current interval and carries the right tail forward to be matched against the
 * following intervals.
 *
 * Side effect: advances this.intervalList past intervals that precede the original
 * read's overlap, so future reads (which arrive in sorted order) skip them.
 *
 * @param read the read to be hard clipped to the interval.
 * @return a shallow copy of the read hard clipped to the interval
 */
private LinkedList<GATKSAMRecord> hardClipReadToInterval(GATKSAMRecord read) {
    LinkedList<GATKSAMRecord> clippedReads = new LinkedList<GATKSAMRecord>();

    GenomeLoc intervalOverlapped = null; // marks the interval to which the original read overlapped (so we can cut all previous intervals from the list)

    boolean originalRead = true; // false if this is the right tail of the original read
    boolean overlap; // keeps track of the interval that overlapped the original read
    boolean doneClipping; // triggers an early exit if we are done clipping this read

    if (isWholeGenome())
        clippedReads.add(read); // if we don't have intervals (wgs) the read goes in unchanged

    for (GenomeLoc interval : intervalList) {

        if (read.isEmpty()) // nothing to do with an empty read (could have been fully clipped before)
            break;

        GATKSAMRecord clippedRead = null; // this will hold the read clipped to the interval to be added in the end of the switch

        switch (ReadUtils.getReadAndIntervalOverlapType(read, interval)) {
            case NO_OVERLAP_RIGHT: // no reads on this interval, check the next interval if this is the original read
                if (!originalRead) // something went wrong if this is the tail of the read
                    throw new ReviewedStingException("tail of the read should never NO_OVERLAP_RIGHT the following interval. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
                overlap = false;
                doneClipping = false;
                break;

            case NO_OVERLAP_HARDCLIPPED_RIGHT: // read used to overlap but got hard clipped and doesn't overlap anymore
                if (originalRead) {
                    overlap = true; // effectively, we have found the read's location and now we are going to try and match it's tail (which happens to be the entire read).
                    clippedRead = GATKSAMRecord.emptyRead(read);
                } else
                    overlap = false;

                doneClipping = false;
                break;

            case NO_OVERLAP_CONTIG: // read is in a different contig
                if (originalRead) { // the original read can be in a bigger contig, but not on a smaller one.
                    if (read.getReferenceIndex() < interval.getContigIndex())
                        throw new ReviewedStingException("read is behind interval list. (contig) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());
                    else {
                        overlap = false;
                        doneClipping = false;
                    }
                } // tail read CANNOT be in a different contig.
                else {
                    if (read.getReferenceIndex() < interval.getContigIndex()) {
                        overlap = false;
                        doneClipping = true;
                    } else
                        throw new ReviewedStingException("Tail read is in bigger contig than interval traversal. " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());

                }
                break;

            case NO_OVERLAP_LEFT:
                if (originalRead) // if this is the first read this should never happen.
                    throw new ReviewedStingException("original read cannot be behind the first interval. (position) " + read.getReadName() + " -- " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd() + " x " + interval.getLocation().toString());

                overlap = false;
                doneClipping = true;
                break;

            case NO_OVERLAP_HARDCLIPPED_LEFT: // read used to overlap but got hard clipped and doesn't overlap anymore
                overlap = originalRead; // if this is the original read, we should not advance the interval list, the original overlap was here.
                doneClipping = true;
                break;

            case OVERLAP_LEFT: // clip the left tail of the read
                clippedRead = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStart() - 1);

                overlap = true;
                doneClipping = true;
                break;

            case OVERLAP_RIGHT: // clip the right tail of the read and try to match it to the next interval
                clippedRead = ReadClipper.hardClipByReferenceCoordinatesRightTail(read, interval.getStop() + 1);
                read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop());

                overlap = true;
                doneClipping = false;
                break;

            case OVERLAP_LEFT_AND_RIGHT: // clip both left and right ends of the read
                clippedRead = ReadClipper.hardClipBothEndsByReferenceCoordinates(read, interval.getStart() - 1, interval.getStop() + 1);
                read = ReadClipper.hardClipByReferenceCoordinatesLeftTail(read, interval.getStop());

                overlap = true;
                doneClipping = false;
                break;

            case OVERLAP_CONTAINED: // don't do anything to the read
                clippedRead = read;

                overlap = true;
                doneClipping = true;
                break;

            default:
                throw new ReviewedStingException("interval overlap returned an unknown / unhandled state. If new state was added to intervalOverlap, it should be handled by hardClipReadToInterval.");
        }

        if (overlap && originalRead)
            intervalOverlapped = interval;

        if (clippedRead != null) {
            originalRead = false;

            if (!clippedRead.isEmpty())
                clippedReads.add(clippedRead); // if the read overlaps the interval entirely within a deletion, it will be entirely clipped off
        }

        if (doneClipping)
            break;
    }

    // drop intervals that precede the original read's overlap; sorted traversal won't revisit them
    if (intervalOverlapped != null)
        intervalList = intervalList.tailSet(intervalOverlapped);

    return clippedReads;
}
|
||||
|
||||
/**
 * Compresses the read name and adds it to output BAM file (reduced BAM)
 * after performing some quality control
 *
 * @param read any read
 */
private void outputRead(GATKSAMRecord read) {
    if (debugLevel == 2) {
        checkForHighMismatch(read); // sanity check consensus reads against the reference
        checkCigar(read);           // sanity check cigar validity
    }

    if (read.isReducedRead())
        nCompressedReads++;
    else {
        // recover the original (pre-clipping) alignment boundaries that map() stashed away
        int originalAlignmentStart = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT);
        int originalAlignmentEnd = (Integer) read.getTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT);

        int startShift = originalAlignmentStart - read.getUnclippedStart(); // we annotate the shifts for better compression
        int endShift = read.getUnclippedEnd() - originalAlignmentEnd; // we annotate the shifts for better compression

        if (startShift > 0)
            read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift); // If the read had any soft clips before getting chopped (variant region) annotate its original alignment (start)
        if (endShift > 0)
            read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift); // If the read had any soft clips before getting chopped (variant region) annotate its original alignment (end)

        totalReads++; // NOTE(review): totalReads was already incremented for this read in map(); confirm the double count is intended
    }

    if (debugLevel == 1)
        System.out.println("BAM: " + read.getCigar() + " " + read.getAlignmentStart() + " " + read.getAlignmentEnd());

    // NOTE(review): re-soft-clipping of kept bases is currently disabled
    // if (!DONT_USE_SOFTCLIPPED_BASES)
    //     reSoftClipBases(read);

    if (!DONT_COMPRESS_READ_NAMES)
        compressReadName(read);

    out.addAlignment(read);
}
|
||||
|
||||
/**
 * Re-introduces soft clips into a read's cigar using the "SL"/"SR" temporary attributes
 * (number of soft clipped bases previously removed from the left/right end).
 *
 * NOTE(review): currently unused — the only call site (in outputRead) is commented out.
 *
 * @param read any read carrying optional "SL"/"SR" temporary attributes
 */
private void reSoftClipBases(GATKSAMRecord read) {
    Integer left = (Integer) read.getTemporaryAttribute("SL");
    Integer right = (Integer) read.getTemporaryAttribute("SR");
    if (left != null || right != null) {
        // deep copy the cigar so we can modify it without touching the original elements
        Cigar newCigar = new Cigar();
        for (CigarElement element : read.getCigar().getCigarElements()) {
            newCigar.add(new CigarElement(element.getLength(), element.getOperator()));
        }

        if (left != null) {
            newCigar = updateFirstSoftClipCigarElement(left, newCigar);
            read.setAlignmentStart(read.getAlignmentStart() + left); // alignment start moves right, past the re-soft-clipped bases
        }

        if (right != null) {
            // handle the right end by inverting the cigar, editing its (now) first element, and inverting back
            Cigar invertedCigar = invertCigar(newCigar);
            newCigar = invertCigar(updateFirstSoftClipCigarElement(right, invertedCigar));
        }
        read.setCigar(newCigar);
    }
}
|
||||
|
||||
/**
|
||||
* Facility routine to revert the first element of a Cigar string (skipping hard clips) into a soft-clip.
|
||||
* To be used on both ends if provided a flipped Cigar
|
||||
*
|
||||
* @param softClipSize the length of the soft clipped element to add
|
||||
* @param originalCigar the original Cigar string
|
||||
* @return a new Cigar object with the soft clips added
|
||||
*/
|
||||
private Cigar updateFirstSoftClipCigarElement (int softClipSize, Cigar originalCigar) {
|
||||
Cigar result = new Cigar();
|
||||
CigarElement leftElement = new CigarElement(softClipSize, CigarOperator.S);
|
||||
boolean updated = false;
|
||||
for (CigarElement element : originalCigar.getCigarElements()) {
|
||||
if (!updated && element.getOperator() == CigarOperator.M) {
|
||||
result.add(leftElement);
|
||||
int newLength = element.getLength() - softClipSize;
|
||||
if (newLength > 0)
|
||||
result.add(new CigarElement(newLength, CigarOperator.M));
|
||||
updated = true;
|
||||
}
|
||||
else
|
||||
result.add(element);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a cigar string, returns the inverted cigar string.
|
||||
*
|
||||
* @param cigar the original cigar
|
||||
* @return the inverted cigar
|
||||
*/
|
||||
private Cigar invertCigar(Cigar cigar) {
|
||||
Stack<CigarElement> stack = new Stack<CigarElement>();
|
||||
for (CigarElement e : cigar.getCigarElements())
|
||||
stack.push(e);
|
||||
Cigar inverted = new Cigar();
|
||||
while (!stack.empty()) {
|
||||
inverted.add(stack.pop());
|
||||
}
|
||||
return inverted;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Quality control procedure that checks if the consensus reads contains too many
 * mismatches with the reference. This should never happen and is a good trigger for
 * errors with the algorithm.
 *
 * Only consensus reads (reduced-read tag present and name starting with "Consensus")
 * longer than 20bp with a mismatch fraction above 0.4 trigger the exception.
 *
 * @param read any read
 * @throws ReviewedStingException if a consensus read has a high mismatch fraction (algorithm bug)
 */
private void checkForHighMismatch(GATKSAMRecord read) {
    final int start = read.getAlignmentStart();
    final int stop = read.getAlignmentEnd();
    final byte[] ref = getToolkit().getReferenceDataSource().getReference().getSubsequenceAt(read.getReferenceName(), start, stop).getBases();
    final int nm = SequenceUtil.countMismatches(read, ref, start - 1); // ref array is 0-based at 'start', hence the offset
    final int readLen = read.getReadLength();
    final double nmFraction = nm / (1.0 * readLen);
    if (nmFraction > 0.4 && readLen > 20 && read.getAttribute(GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG) != null && read.getReadName().startsWith("Consensus"))
        throw new ReviewedStingException("BUG: High mismatch fraction found in read " + read.getReadName() + " position: " + read.getReferenceName() + ":" + read.getAlignmentStart() + "-" + read.getAlignmentEnd());
}
|
||||
|
||||
private void checkCigar (GATKSAMRecord read) {
|
||||
if (read.getCigar().isValid(null, -1) != null) {
|
||||
throw new ReviewedStingException("BUG: cigar string is not valid: " + read.getCigarString());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Compresses the read name using the readNameHash if we have already compressed
|
||||
* this read name before.
|
||||
*
|
||||
* @param read any read
|
||||
*/
|
||||
private void compressReadName(GATKSAMRecord read) {
|
||||
String name = read.getReadName();
|
||||
String compressedName = read.isReducedRead() ? "C" : "";
|
||||
if (readNameHash.containsKey(name))
|
||||
compressedName += readNameHash.get(name).toString();
|
||||
else {
|
||||
readNameHash.put(name, nextReadNumber);
|
||||
compressedName += nextReadNumber.toString();
|
||||
nextReadNumber++;
|
||||
}
|
||||
|
||||
read.setReadName(compressedName);
|
||||
}
|
||||
|
||||
/**
 * Returns true if the read is the original read that went through map().
 *
 * This is important to know so we can decide what reads to pull from the stash. Only reads that came before the original read should be pulled.
 *
 * Note: in WGS mode this is true for every read; the isWholeGenome() short-circuit
 * also protects getFirst() from being called on an empty list.
 *
 * @param list the list
 * @param read the read
 * @return Returns true if the read is the original read that went through map().
 */
private boolean isOriginalRead(LinkedList<GATKSAMRecord> list, GATKSAMRecord read) {
    return isWholeGenome() || list.getFirst().equals(read);
}
|
||||
|
||||
/**
 * Checks whether or not the intervalList is empty, meaning we're running in WGS mode.
 *
 * @return whether or not we're running in WGS (whole-genome) mode
 */
private boolean isWholeGenome() {
    return intervalList.isEmpty();
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,110 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
|
||||
* This class implements a "read stash" that keeps reads always sorted in alignment order. Useful
|
||||
* for read walkers that alter the alignment information of the incoming reads, but need to
|
||||
* maintain the reads sorted for the reduce step. (e.g. ReduceReads)
|
||||
*/
|
||||
|
||||
public class ReduceReadsStash {
|
||||
protected MultiSampleCompressor compressor;
|
||||
SortedSet<GATKSAMRecord> outOfOrderReads;
|
||||
|
||||
/**
|
||||
* Creates a stash with the default sorting order (read alignment)
|
||||
* @param compressor the MultiSampleCompressor object to be used with this stash (for stash.close())
|
||||
*/
|
||||
public ReduceReadsStash(MultiSampleCompressor compressor) {
|
||||
this.compressor = compressor;
|
||||
this.outOfOrderReads = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
|
||||
}
|
||||
|
||||
/**
|
||||
* Get all reads before a given read (for processing)
|
||||
*
|
||||
* @param read the original read
|
||||
* @return all reads that have alignment start before the original read.
|
||||
*/
|
||||
public List<GATKSAMRecord> getAllReadsBefore(GATKSAMRecord read) {
|
||||
List<GATKSAMRecord> result = new LinkedList<GATKSAMRecord>();
|
||||
GATKSAMRecord newHead = null;
|
||||
|
||||
for (GATKSAMRecord stashedRead : outOfOrderReads) {
|
||||
if (ReadUtils.compareSAMRecords(stashedRead, read) <= 0)
|
||||
result.add(stashedRead);
|
||||
else {
|
||||
newHead = stashedRead;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (result.size() > 0) {
|
||||
if (result.size() == outOfOrderReads.size())
|
||||
outOfOrderReads.clear();
|
||||
else
|
||||
outOfOrderReads = new TreeSet<GATKSAMRecord>(outOfOrderReads.tailSet(newHead));
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* sends the read to the MultiSampleCompressor
|
||||
*
|
||||
* @param read the read to be compressed
|
||||
* @return any compressed reads that may have resulted from adding this read to the machinery (due to the sliding window)
|
||||
*/
|
||||
public Iterable<GATKSAMRecord> compress(GATKSAMRecord read) {
|
||||
return compressor.addAlignment(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a read to the stash
|
||||
*
|
||||
* @param read any read
|
||||
*/
|
||||
public void add(GATKSAMRecord read) {
|
||||
outOfOrderReads.add(read);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the stash, processing all remaining reads in order
|
||||
*
|
||||
* @return a list of all the reads produced by the SlidingWindow machinery)
|
||||
*/
|
||||
public Iterable<GATKSAMRecord> close() {
|
||||
LinkedList<GATKSAMRecord> result = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
// compress all the stashed reads (in order)
|
||||
for (GATKSAMRecord read : outOfOrderReads)
|
||||
for (GATKSAMRecord compressedRead : compressor.addAlignment(read))
|
||||
result.add(compressedRead);
|
||||
|
||||
// output any remaining reads from the compressor
|
||||
for (GATKSAMRecord read : compressor.close())
|
||||
result.add(read);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Useful debug functionality, outputs all elements in the stash
|
||||
*/
|
||||
public void print() {
|
||||
int i = 1;
|
||||
System.out.println("Stash Contents:");
|
||||
for (GATKSAMRecord read : outOfOrderReads)
|
||||
System.out.println(String.format("%3d: %s %d %d", i++, read.getCigarString(), read.getAlignmentStart(), read.getAlignmentEnd()));
|
||||
System.out.println();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,83 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.TreeSet;
|
||||
|
||||
/**
 * Single-sample read compressor: feeds reads to a SlidingWindow, creating a new
 * window whenever a read lands on a new contig or too far past the current window.
 *
 * @author depristo
 * @version 0.1
 */
public class SingleSampleCompressor implements Compressor {
    protected static final Logger logger = Logger.getLogger(SingleSampleCompressor.class);

    protected final int contextSize;          // context size used by each sliding window
    protected final int downsampleCoverage;   // coverage target passed to each sliding window
    protected int minMappingQuality;
    protected int slidingWindowCounter;       // sequential number given to each window created

    protected final String sampleName;

    protected SlidingWindow slidingWindow;    // the currently open window, or null
    protected double minAltProportionToTriggerVariant;
    protected double minIndelProportionToTriggerVariant;
    protected int minBaseQual;

    protected ReduceReads.DownsampleStrategy downsampleStrategy;

    /**
     * Creates a compressor for a single sample; all parameters are forwarded to the
     * SlidingWindow objects created as reads come in.
     */
    public SingleSampleCompressor(final String sampleName,
                                  final int contextSize,
                                  final int downsampleCoverage,
                                  final int minMappingQuality,
                                  final double minAltProportionToTriggerVariant,
                                  final double minIndelProportionToTriggerVariant,
                                  final int minBaseQual,
                                  final ReduceReads.DownsampleStrategy downsampleStrategy) {
        this.sampleName = sampleName;
        this.contextSize = contextSize;
        this.downsampleCoverage = downsampleCoverage;
        this.minMappingQuality = minMappingQuality;
        this.slidingWindowCounter = 0;
        this.minAltProportionToTriggerVariant = minAltProportionToTriggerVariant;
        this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
        this.minBaseQual = minBaseQual;
        this.downsampleStrategy = downsampleStrategy;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Iterable<GATKSAMRecord> addAlignment( GATKSAMRecord read ) {
        TreeSet<GATKSAMRecord> result = new TreeSet<GATKSAMRecord>(new AlignmentStartWithNoTiesComparator());
        int readOriginalStart = read.getUnclippedStart();

        // create a new window if:
        if ((slidingWindow != null) &&
            ( ( read.getReferenceIndex() != slidingWindow.getContigIndex() ) ||       // this is a brand new contig
              (readOriginalStart - contextSize > slidingWindow.getStopLocation()))) { // this read is too far away from the end of the current sliding window

            // close the current sliding window
            result.addAll(slidingWindow.close());
            slidingWindow = null; // so we create a new one on the next if
        }

        if ( slidingWindow == null) { // this is the first read
            slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities());
            slidingWindowCounter++;
        }

        result.addAll(slidingWindow.addRead(read));
        return result;
    }

    /**
     * Closes the current window, if any, and returns whatever it still holds.
     */
    @Override
    public Iterable<GATKSAMRecord> close() {
        return (slidingWindow != null) ? slidingWindow.close() : new TreeSet<GATKSAMRecord>();
    }

}
|
||||
|
||||
|
|
@ -0,0 +1,713 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.gatk.downsampling.FractionalDownsampler;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.ListIterator;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: roger
|
||||
* Date: 8/3/11
|
||||
* Time: 2:24 PM
|
||||
*/
|
||||
public class SlidingWindow {
|
||||
|
||||
// Sliding Window data
final private LinkedList<GATKSAMRecord> readsInWindow;    // reads currently held by the window
final private LinkedList<HeaderElement> windowHeader;     // per-position state across the window
protected int contextSize; // the largest context size (between mismatches and indels)
protected int stopLocation;                               // reference stop of the window (-1 until set)
protected String contig;
protected int contigIndex;
protected SAMFileHeader header;
protected GATKSAMReadGroupRecord readGroupAttribute;      // read group given to synthetic reads
protected int downsampleCoverage;

// Running consensus data
protected SyntheticRead runningConsensus;
protected int consensusCounter;                           // suffix appended to consensus read names
protected String consensusReadName;

// Filtered Data Consensus data
protected SyntheticRead filteredDataConsensus;
protected int filteredDataConsensusCounter;               // suffix appended to filtered-data read names
protected String filteredDataReadName;


// Additional parameters
protected double MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to mismatches
protected double MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT; // proportion has to be greater than this value to trigger variant region due to deletions
protected int MIN_BASE_QUAL_TO_COUNT; // qual has to be greater than or equal to this value
protected int MIN_MAPPING_QUALITY;

protected ReduceReads.DownsampleStrategy downsampleStrategy;
private boolean hasIndelQualities;                        // whether incoming reads carry base indel qualities

/**
 * The types of synthetic reads to use in the finalizeAndAdd method
 */
private enum ConsensusType {
    CONSENSUS,
    FILTERED,
    BOTH
}
|
||||
|
||||
/** @return the reference stop position of the window (-1 until set) */
public int getStopLocation() {
    return stopLocation;
}

/** @return the name of the contig this window is on */
public String getContig() {
    return contig;
}

/** @return the index of the contig this window is on */
public int getContigIndex() {
    return contigIndex;
}

/** @return the location of the first window header element, or -1 if the header is empty */
public int getStartLocation() {
    return windowHeader.isEmpty() ? -1 : windowHeader.peek().getLocation();
}
|
||||
|
||||
|
||||
/**
 * Builds an empty sliding window over the given contig; all thresholds and
 * downsampling parameters are stored for use as reads are added.
 *
 * @param windowNumber sequential window id, used to build unique synthetic read names
 */
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader header, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) {
    this.stopLocation = -1; // no reads yet
    this.contextSize = contextSize;
    this.downsampleCoverage = downsampleCoverage;

    this.MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT = minAltProportionToTriggerVariant;
    this.MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT = minIndelProportionToTriggerVariant;
    this.MIN_BASE_QUAL_TO_COUNT = minBaseQual;
    this.MIN_MAPPING_QUALITY = minMappingQuality;

    this.windowHeader = new LinkedList<HeaderElement>();
    this.readsInWindow = new LinkedList<GATKSAMRecord>();

    this.contig = contig;
    this.contigIndex = contigIndex;
    this.header = header;
    this.readGroupAttribute = readGroupAttribute;

    // synthetic read names embed the window number so they stay unique across windows
    this.consensusCounter = 0;
    this.consensusReadName = "Consensus-" + windowNumber + "-";

    this.filteredDataConsensusCounter = 0;
    this.filteredDataReadName = "Filtered-" + windowNumber + "-";

    this.runningConsensus = null;
    this.filteredDataConsensus = null;

    this.downsampleStrategy = downsampleStrategy;
    this.hasIndelQualities = hasIndelQualities;
}
|
||||
|
||||
/**
 * Add a read to the sliding window and slides the window accordingly.
 *
 * Reads are assumed to be in order, therefore, when a read is added the sliding window can
 * assume that no more reads will affect read.getUnclippedStart() - contextSizeMismatches. The window
 * slides forward to that position and returns all reads that may have been finalized in the
 * sliding process.
 *
 * @param read the read
 * @return a list of reads that have been finished by sliding the window.
 */
public List<GATKSAMRecord> addRead(GATKSAMRecord read) {
    updateHeaderCounts(read, false); // update the window header counts
    readsInWindow.add(read); // add read to sliding reads
    return slideWindow(read.getUnclippedStart());
}
|
||||
|
||||
/**
|
||||
* returns the next complete or incomplete variant region between 'from' (inclusive) and 'to' (exclusive)
|
||||
*
|
||||
* @param from beginning window header index of the search window (inclusive)
|
||||
* @param to end window header index of the search window (exclusive)
|
||||
* @param variantSite boolean array with true marking variant regions
|
||||
* @return null if nothing is variant, start/stop if there is a complete variant region, start/-1 if there is an incomplete variant region.
|
||||
*/
|
||||
private Pair<Integer, Integer> getNextVariantRegion(int from, int to, boolean[] variantSite) {
|
||||
boolean foundStart = false;
|
||||
int variantRegionStartIndex = 0;
|
||||
for (int i=from; i<to; i++) {
|
||||
if (variantSite[i] && !foundStart) {
|
||||
variantRegionStartIndex = i;
|
||||
foundStart = true;
|
||||
}
|
||||
else if(!variantSite[i] && foundStart) {
|
||||
return(new Pair<Integer, Integer>(variantRegionStartIndex, i-1));
|
||||
}
|
||||
}
|
||||
return (foundStart) ? new Pair<Integer, Integer>(variantRegionStartIndex, -1) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a list with all the complete and incomplete variant regions within 'from' (inclusive) and 'to' (exclusive)
|
||||
*
|
||||
* @param from beginning window header index of the search window (inclusive)
|
||||
* @param to end window header index of the search window (exclusive)
|
||||
* @param variantSite boolean array with true marking variant regions
|
||||
* @return a list with start/stops of variant regions following getNextVariantRegion description
|
||||
*/
|
||||
private List<Pair<Integer, Integer>> getAllVariantRegions(int from, int to, boolean[] variantSite) {
|
||||
List<Pair<Integer,Integer>> regions = new LinkedList<Pair<Integer, Integer>>();
|
||||
int index = from;
|
||||
while(index < to) {
|
||||
Pair<Integer,Integer> result = getNextVariantRegion(index, to, variantSite);
|
||||
if (result == null)
|
||||
break;
|
||||
|
||||
regions.add(result);
|
||||
if (result.getSecond() < 0)
|
||||
break;
|
||||
index = result.getSecond() + 1;
|
||||
}
|
||||
return regions;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Determines if the window can be slid given the new incoming read.
|
||||
*
|
||||
* We check from the start of the window to the (unclipped) start of the new incoming read if there
|
||||
* is any variant.
|
||||
* If there are variant sites, we check if it's time to close the variant region.
|
||||
*
|
||||
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
|
||||
* @return all reads that have fallen to the left of the sliding window after the slide
|
||||
*/
|
||||
protected List<GATKSAMRecord> slideWindow(int incomingReadUnclippedStart) {
|
||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
if (incomingReadUnclippedStart - contextSize > getStartLocation()) {
|
||||
int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation();
|
||||
boolean[] variantSite = markSites(getStartLocation() + readStartHeaderIndex);
|
||||
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
|
||||
|
||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, variantSite);
|
||||
finalizedReads = closeVariantRegions(regions, false);
|
||||
|
||||
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>();
|
||||
for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!)
|
||||
if (read.getAlignmentEnd() < getStartLocation()) {
|
||||
readsToRemove.add(read);
|
||||
}
|
||||
}
|
||||
for (GATKSAMRecord read : readsToRemove) {
|
||||
readsInWindow.remove(read);
|
||||
}
|
||||
}
|
||||
|
||||
return finalizedReads;
|
||||
}
|
||||
|
||||
/**
|
||||
* returns an array marked with variant and non-variant regions (it uses
|
||||
* markVariantRegions to make the marks)
|
||||
*
|
||||
* @param stop check the window from start to stop (not-inclusive)
|
||||
* @return a boolean array with 'true' marking variant regions and false marking consensus sites
|
||||
*/
|
||||
protected boolean[] markSites(int stop) {
|
||||
|
||||
boolean[] markedSites = new boolean[stop - getStartLocation() + contextSize + 1];
|
||||
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.iterator();
|
||||
for (int i = getStartLocation(); i < stop; i++) {
|
||||
if (headerElementIterator.hasNext()) {
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
|
||||
if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
|
||||
markVariantRegion(markedSites, i - getStartLocation());
|
||||
|
||||
} else
|
||||
break;
|
||||
}
|
||||
return markedSites;
|
||||
}
|
||||
|
||||
/**
|
||||
* Marks the sites around the variant site (as true)
|
||||
*
|
||||
* @param markedSites the boolean array to bear the marks
|
||||
* @param variantSiteLocation the location where a variant site was found
|
||||
*/
|
||||
protected void markVariantRegion(boolean[] markedSites, int variantSiteLocation) {
|
||||
int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
|
||||
int to = (variantSiteLocation + contextSize + 1 > markedSites.length) ? markedSites.length : variantSiteLocation + contextSize + 1;
|
||||
for (int i = from; i < to; i++)
|
||||
markedSites[i] = true;
|
||||
}
|
||||
|
||||
/**
 * Adds bases to the running consensus or filtered data synthetic reads accordingly.
 *
 * The window header region [start, end) is consumed as alternating runs of
 * consensus / filtered / empty elements. Each time the run type changes, the
 * other synthetic read type is finalized first (so a consensus read never spans
 * a filtered-data gap and vice versa), the current run is appended, and the
 * method recurses on the remainder of the region.
 *
 * @param start the first header index to add to consensus (inclusive)
 * @param end   the first header index NOT TO add to consensus (exclusive)
 * @return the consensus reads generated by this call; empty list if none were generated
 */
protected List<GATKSAMRecord> addToSyntheticReads(int start, int end) {
    LinkedList<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
    if (start < end) {

        ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);

        if (!headerElementIterator.hasNext())
            throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, windowHeader.size(), end));

        // the first element of the run decides which of the three branches handles it
        HeaderElement headerElement = headerElementIterator.next();

        if (headerElement.hasConsensusData()) {
            // close any open filtered-data read before starting/continuing a consensus run
            reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));

            int endOfConsensus = findNextNonConsensusElement(start, end);
            addToRunningConsensus(start, endOfConsensus);

            // guards against an infinite recursion if the scan did not advance
            if (endOfConsensus <= start)
                throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start));

            reads.addAll(addToSyntheticReads(endOfConsensus, end));
        } else if (headerElement.hasFilteredData()) {
            // close any open consensus read before starting/continuing a filtered-data run
            reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));

            int endOfFilteredData = findNextNonFilteredDataElement(start, end);
            addToFilteredData(start, endOfFilteredData);

            if (endOfFilteredData <= start)
                throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));

            reads.addAll(addToSyntheticReads(endOfFilteredData, end));
        } else if (headerElement.isEmpty()) {
            // a gap: both synthetic read types must be closed, nothing is appended
            reads.addAll(finalizeAndAdd(ConsensusType.BOTH));

            int endOfEmptyData = findNextNonEmptyElement(start, end);

            if (endOfEmptyData <= start)
                throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));

            reads.addAll(addToSyntheticReads(endOfEmptyData, end));
        } else
            throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start));

    }

    return reads;
}
|
||||
|
||||
/**
|
||||
* Finalizes one or more synthetic reads.
|
||||
*
|
||||
* @param type the synthetic reads you want to close
|
||||
* @return the GATKSAMRecords generated by finalizing the synthetic reads
|
||||
*/
|
||||
private List<GATKSAMRecord> finalizeAndAdd(ConsensusType type) {
|
||||
GATKSAMRecord read = null;
|
||||
List<GATKSAMRecord> list = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
switch (type) {
|
||||
case CONSENSUS:
|
||||
read = finalizeRunningConsensus();
|
||||
break;
|
||||
case FILTERED:
|
||||
read = finalizeFilteredDataConsensus();
|
||||
break;
|
||||
case BOTH:
|
||||
read = finalizeRunningConsensus();
|
||||
if (read != null) list.add(read);
|
||||
read = finalizeFilteredDataConsensus();
|
||||
}
|
||||
if (read != null)
|
||||
list.add(read);
|
||||
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks for the next position without consensus data
|
||||
*
|
||||
* @param start beginning of the filtered region
|
||||
* @param upTo limit to search for another consensus element
|
||||
* @return next position with consensus data or empty
|
||||
*/
|
||||
private int findNextNonConsensusElement(int start, int upTo) {
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
int index = start;
|
||||
while (index < upTo) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
throw new ReviewedStingException("There are no more header elements in this window");
|
||||
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
if (!headerElement.hasConsensusData())
|
||||
break;
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks for the next position without filtered data
|
||||
*
|
||||
* @param start beginning of the region
|
||||
* @param upTo limit to search for
|
||||
* @return next position with no filtered data
|
||||
*/
|
||||
private int findNextNonFilteredDataElement(int start, int upTo) {
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
int index = start;
|
||||
while (index < upTo) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
throw new ReviewedStingException("There are no more header elements in this window");
|
||||
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
if (!headerElement.hasFilteredData() || headerElement.hasConsensusData())
|
||||
break;
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
/**
|
||||
* Looks for the next non-empty header element
|
||||
*
|
||||
* @param start beginning of the region
|
||||
* @param upTo limit to search for
|
||||
* @return next position with non-empty element
|
||||
*/
|
||||
private int findNextNonEmptyElement(int start, int upTo) {
|
||||
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
int index = start;
|
||||
while (index < upTo) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
throw new ReviewedStingException("There are no more header elements in this window");
|
||||
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
if (!headerElement.isEmpty())
|
||||
break;
|
||||
index++;
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Adds bases to the filtered data synthetic read.
|
||||
*
|
||||
* Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData
|
||||
* bases.
|
||||
*
|
||||
* @param start the first header index to add to consensus
|
||||
* @param end the first header index NOT TO add to consensus
|
||||
*/
|
||||
private void addToFilteredData(int start, int end) {
|
||||
if (filteredDataConsensus == null)
|
||||
filteredDataConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);
|
||||
|
||||
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
for (int index = start; index < end; index++) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist");
|
||||
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
if (headerElement.hasConsensusData())
|
||||
throw new ReviewedStingException("Found consensus data inside region to add to filtered data.");
|
||||
|
||||
if (!headerElement.hasFilteredData())
|
||||
throw new ReviewedStingException("No filtered data in " + index);
|
||||
|
||||
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds bases to the filtered data synthetic read.
|
||||
*
|
||||
* Different from the addToConsensus method, this method assumes a contiguous sequence of filteredData
|
||||
* bases.
|
||||
*
|
||||
* @param start the first header index to add to consensus
|
||||
* @param end the first header index NOT TO add to consensus
|
||||
*/
|
||||
private void addToRunningConsensus(int start, int end) {
|
||||
if (runningConsensus == null)
|
||||
runningConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);
|
||||
|
||||
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
|
||||
for (int index = start; index < end; index++) {
|
||||
if (!headerElementIterator.hasNext())
|
||||
throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist");
|
||||
|
||||
HeaderElement headerElement = headerElementIterator.next();
|
||||
if (!headerElement.hasConsensusData())
|
||||
throw new ReviewedStingException("No CONSENSUS data in " + index);
|
||||
|
||||
genericAddBaseToConsensus(runningConsensus, headerElement.getConsensusBaseCounts(), headerElement.getRMS());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic accessor to add base and qualities to a synthetic read
|
||||
*
|
||||
* @param syntheticRead the synthetic read to add to
|
||||
* @param baseCounts the base counts object in the header element
|
||||
* @param rms the rms mapping quality in the header element
|
||||
*/
|
||||
private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
|
||||
BaseIndex base = baseCounts.baseIndexWithMostCounts();
|
||||
byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE);
|
||||
byte qual = baseCounts.averageQualsOfMostCommonBase();
|
||||
byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase();
|
||||
byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase();
|
||||
syntheticRead.add(base, count, qual, insQual, delQual, rms);
|
||||
}
|
||||
|
||||
/**
 * Finalizes a variant region and any adjacent synthetic reads.
 *
 * Side effects (order matters): every overlapping read is subtracted from the
 * window header counts, the consensus covering [0, start) is flushed, and all
 * emitted reads are removed from the active-read list so the next region does
 * not subtract them from the header again.
 *
 * @param start the first window header index in the variant region (inclusive)
 * @param stop the last window header index of the variant region (inclusive)
 * @return all reads contained in the variant region plus any adjacent synthetic reads
 */
@Requires("start <= stop")
protected List<GATKSAMRecord> closeVariantRegion(int start, int stop) {
    List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();

    int refStart = windowHeader.get(start).getLocation(); // All operations are reference based, not read based
    int refStop = windowHeader.get(stop).getLocation();

    for (GATKSAMRecord read : readsInWindow) { // Keep all reads that overlap the variant region
        if (read.getSoftStart() <= refStop && read.getAlignmentEnd() >= refStart) {
            allReads.add(read);
            updateHeaderCounts(read, true); // Remove this read from the window header entirely
        }
    }

    // downsample the kept reads only if a target coverage was configured
    List<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
    // flush the consensus for everything left of the region, then close any open synthetic reads
    result.addAll(addToSyntheticReads(0, start));
    result.addAll(finalizeAndAdd(ConsensusType.BOTH));

    for (GATKSAMRecord read : result) {
        readsInWindow.remove(read); // todo -- not optimal, but needs to be done so the next region doesn't try to remove the same reads from the header counts.
    }

    return result; // finalized reads will be downsampled if necessary
}
|
||||
|
||||
|
||||
/**
 * Closes every variant region in the list, then trims the window header up to
 * (but not including) the last closed position.
 *
 * @param regions    start/stop pairs as produced by getAllVariantRegions
 *                   (stop == -1 marks an incomplete region)
 * @param forceClose when true, incomplete regions are closed at the end of the
 *                   window header (used when the whole window is being closed)
 * @return all reads emitted by closing the regions
 */
private List<GATKSAMRecord> closeVariantRegions(List<Pair<Integer, Integer>> regions, boolean forceClose) {
    List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
    if (!regions.isEmpty()) {
        int lastStop = -1;
        for (Pair<Integer, Integer> region : regions) {
            int start = region.getFirst();
            int stop = region.getSecond();
            if (stop < 0 && forceClose)
                stop = windowHeader.size() - 1; // force an incomplete region to close at the window's end
            if (stop >= 0) {
                allReads.addAll(closeVariantRegion(start, stop));
                lastStop = stop;
            }
        }
        for (int i = 0; i < lastStop; i++) // clean up the window header elements up until the end of the variant region. (we keep the last element in case the following element had a read that started with insertion)
            windowHeader.remove(); // todo -- can't believe java doesn't allow me to just do windowHeader = windowHeader.get(stop). Should be more efficient here!
    }
    return allReads;
}
|
||||
|
||||
/**
|
||||
* Downsamples a variant region to the downsample coverage of the sliding window.
|
||||
*
|
||||
* It will use the downsampling strategy defined by the SlidingWindow
|
||||
*
|
||||
* @param allReads the reads to select from (all reads that cover the window)
|
||||
* @return a list of reads selected by the downsampler to cover the window to at least the desired coverage
|
||||
*/
|
||||
protected List<GATKSAMRecord> downsampleVariantRegion(final List<GATKSAMRecord> allReads) {
|
||||
double fraction = 100 / allReads.size();
|
||||
if (fraction >= 1)
|
||||
return allReads;
|
||||
|
||||
FractionalDownsampler <GATKSAMRecord> downsampler = new FractionalDownsampler<GATKSAMRecord>(fraction);
|
||||
downsampler.submit(allReads);
|
||||
return downsampler.consumeDownsampledItems();
|
||||
}
|
||||
|
||||
/**
|
||||
* Properly closes a Sliding Window, finalizing all consensus and variant
|
||||
* regions that still exist regardless of being able to fulfill the
|
||||
* context size requirement in the end.
|
||||
*
|
||||
* @return All reads generated
|
||||
*/
|
||||
public List<GATKSAMRecord> close() {
|
||||
// mark variant regions
|
||||
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
boolean[] variantSite = markSites(stopLocation + 1);
|
||||
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), variantSite);
|
||||
finalizedReads = closeVariantRegions(regions, true);
|
||||
|
||||
if (!windowHeader.isEmpty()) {
|
||||
finalizedReads.addAll(addToSyntheticReads(0, windowHeader.size() - 1));
|
||||
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
|
||||
}
|
||||
|
||||
}
|
||||
return finalizedReads;
|
||||
}
|
||||
|
||||
/**
|
||||
* generates the SAM record for the running consensus read and resets it (to null)
|
||||
*
|
||||
* @return the read contained in the running consensus
|
||||
*/
|
||||
protected GATKSAMRecord finalizeRunningConsensus() {
|
||||
GATKSAMRecord finalizedRead = null;
|
||||
if (runningConsensus != null) {
|
||||
if (runningConsensus.size() > 0)
|
||||
finalizedRead = runningConsensus.close();
|
||||
else
|
||||
consensusCounter--;
|
||||
|
||||
runningConsensus = null;
|
||||
}
|
||||
return finalizedRead;
|
||||
}
|
||||
|
||||
/**
|
||||
* generates the SAM record for the filtered data consensus and resets it (to null)
|
||||
*
|
||||
* @return the read contained in the running consensus
|
||||
*/
|
||||
protected GATKSAMRecord finalizeFilteredDataConsensus() {
|
||||
GATKSAMRecord finalizedRead = null;
|
||||
if (filteredDataConsensus != null) {
|
||||
if (filteredDataConsensus.size() > 0)
|
||||
finalizedRead = filteredDataConsensus.close();
|
||||
else
|
||||
filteredDataConsensusCounter--;
|
||||
|
||||
filteredDataConsensus = null;
|
||||
}
|
||||
return finalizedRead;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Updates the sliding window's header counts with the incoming read bases, insertions
 * and deletions.
 *
 * The same routine both adds a read (removeRead == false) — growing the header as
 * needed — and subtracts a previously-added read (removeRead == true) from the
 * existing counts.
 *
 * @param read       the read to add to (or remove from) the sliding window header
 * @param removeRead true to subtract the read's contribution instead of adding it
 */
protected void updateHeaderCounts(GATKSAMRecord read, boolean removeRead) {
    byte[] bases = read.getReadBases();
    byte[] quals = read.getBaseQualities();
    byte[] insQuals = read.getExistingBaseInsertionQualities();  // may be null when the read carries no indel qualities
    byte[] delQuals = read.getExistingBaseDeletionQualities();   // may be null when the read carries no indel qualities
    int readStart = read.getSoftStart();
    int readEnd = read.getSoftEnd();
    Cigar cigar = read.getCigar();

    int readBaseIndex = 0;                // index into the read's base/qual arrays
    int startLocation = getStartLocation();
    // header index of the read's first position; 0 if the window is still empty
    int locationIndex = startLocation < 0 ? 0 : readStart - startLocation;

    // a read being removed must still be (at least partially) inside the window
    if (removeRead && locationIndex < 0)
        throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation);

    if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it
        if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window
            for (int i = 1; i <= -locationIndex; i++)
                windowHeader.addFirst(new HeaderElement(startLocation - i));

            startLocation = readStart; // update start location accordingly
            locationIndex = 0;
        }

        if (stopLocation < readEnd) { // Do we need to add extra elements to the header?
            int elementsToAdd = (stopLocation < 0) ? readEnd - readStart + 1 : readEnd - stopLocation;
            while (elementsToAdd-- > 0)
                windowHeader.addLast(new HeaderElement(readEnd - elementsToAdd));

            stopLocation = readEnd; // update stopLocation accordingly
        }

        // Special case for leading insertions before the beginning of the sliding read
        if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) {
            windowHeader.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added
            locationIndex = 1; // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing.
        }
    }

    // walk the CIGAR, applying each operator to the header element(s) it covers
    Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(locationIndex);
    HeaderElement headerElement;
    for (CigarElement cigarElement : cigar.getCigarElements()) {
        switch (cigarElement.getOperator()) {
            case H: // hard clips consume neither read bases nor header positions
                break;
            case I:
                if (removeRead && locationIndex == 0) { // special case, if we are removing a read that starts in insertion and we don't have the previous header element anymore, don't worry about it.
                    break;
                }

                headerElement = windowHeader.get(locationIndex - 1); // insertions are added to the base to the left (previous element)

                if (removeRead) {
                    headerElement.removeInsertionToTheRight();
                }
                else {
                    headerElement.addInsertionToTheRight();
                }
                readBaseIndex += cigarElement.getLength(); // insertions consume read bases but no header positions
                break; // just ignore the insertions at the beginning of the read
            case D:
                int nDeletions = cigarElement.getLength();
                while (nDeletions-- > 0) { // deletions are added to the baseCounts with the read mapping quality as it's quality score
                    headerElement = headerElementIterator.next();
                    byte mq = (byte) read.getMappingQuality();
                    if (removeRead)
                        headerElement.removeBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);
                    else
                        headerElement.addBase((byte) 'D', mq, mq, mq, mq, MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, false);

                    locationIndex++;
                }
                break;
            case S:
            case M:
            case P:
            case EQ:
            case X:
                // these operators consume both read bases and header positions;
                // the final flag tells the header whether the base was soft-clipped
                int nBasesToAdd = cigarElement.getLength();
                while (nBasesToAdd-- > 0) {
                    headerElement = headerElementIterator.next();
                    byte insertionQuality = insQuals == null ? -1 : insQuals[readBaseIndex]; // if the read doesn't have indel qualities, use -1 (doesn't matter the value because it won't be used for anything)
                    byte deletionQuality = delQuals == null ? -1 : delQuals[readBaseIndex];
                    if (removeRead)
                        headerElement.removeBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);
                    else
                        headerElement.addBase(bases[readBaseIndex], quals[readBaseIndex], insertionQuality, deletionQuality, read.getMappingQuality(), MIN_BASE_QUAL_TO_COUNT, MIN_MAPPING_QUALITY, cigarElement.getOperator() == CigarOperator.S);

                    readBaseIndex++;
                    locationIndex++;
                }
                break;
        }
    }
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,285 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import net.sf.samtools.CigarOperator;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.gatk.walkers.bqsr.EventType;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Running Consensus is a read that is compressed as a sliding window travels over the reads
|
||||
* and keeps track of all the bases that are outside of variant regions.
|
||||
*
|
||||
* Consensus reads have qual fields that correspond to the number of reads that had the base
|
||||
* and passed the minimum quality threshold.
|
||||
*
|
||||
* The mapping quality of a consensus read is the average RMS of the mapping qualities of all reads
|
||||
* that compose the consensus
|
||||
*
|
||||
* @author Mauricio Carneiro
|
||||
* @since 8/26/11
|
||||
*/
|
||||
public class SyntheticRead {
|
||||
private List<BaseIndex> bases;
|
||||
private List<Byte> counts;
|
||||
private List<Byte> quals;
|
||||
private List<Byte> insertionQuals;
|
||||
private List<Byte> deletionQuals;
|
||||
private double mappingQuality; // the average of the rms of the mapping qualities of all the reads that contributed to this consensus
|
||||
private String readTag;
|
||||
|
||||
// Information to produce a GATKSAMRecord
|
||||
private SAMFileHeader header;
|
||||
private GATKSAMReadGroupRecord readGroupRecord;
|
||||
private String contig;
|
||||
private int contigIndex;
|
||||
private String readName;
|
||||
private Integer refStart;
|
||||
private boolean hasIndelQualities = false;
|
||||
|
||||
/**
 * Full initialization of the running consensus when all the information is known
 * and bases are ready to be added.
 *
 * @param header            GATKSAMRecord file header
 * @param readGroupRecord   read group for the GATKSAMRecord
 * @param contig            the read's contig name
 * @param contigIndex       the read's contig index
 * @param readName          the read's name
 * @param refStart          the alignment start (reference based)
 * @param readTag           the reduce reads tag for the synthetic read
 * @param hasIndelQualities whether the synthetic read should carry insertion/deletion qualities
 */
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities) {
    final int initialCapacity = 10000;  // pre-size to avoid repeated ArrayList growth
    this.bases = new ArrayList<BaseIndex>(initialCapacity);
    this.counts = new ArrayList<Byte>(initialCapacity);
    this.quals = new ArrayList<Byte>(initialCapacity);
    this.insertionQuals = new ArrayList<Byte>(initialCapacity);
    this.deletionQuals = new ArrayList<Byte>(initialCapacity);
    this.mappingQuality = 0.0;

    this.readTag = readTag;
    this.header = header;
    this.readGroupRecord = readGroupRecord;
    this.contig = contig;
    this.contigIndex = contigIndex;
    this.readName = readName;
    this.refStart = refStart;
    this.hasIndelQualities = hasIndelQualities;
}
|
||||
|
||||
/**
 * Initializes a synthetic read from pre-built per-position lists (bases, counts and
 * qualities) plus the record-producing metadata. All lists are stored as given,
 * not copied.
 */
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities) {
    // per-position data
    this.bases = bases;
    this.counts = counts;
    this.quals = quals;
    this.insertionQuals = insertionQuals;
    this.deletionQuals = deletionQuals;
    this.mappingQuality = mappingQuality;

    // metadata needed to produce the GATKSAMRecord
    this.readTag = readTag;
    this.header = header;
    this.readGroupRecord = readGroupRecord;
    this.contig = contig;
    this.contigIndex = contigIndex;
    this.readName = readName;
    this.refStart = refStart;
    this.hasIndelQualities = hasIndelQualities;
}
|
||||
|
||||
/**
|
||||
* Easy access to keep adding to a running consensus that has already been
|
||||
* initialized with the correct read name and refStart
|
||||
*
|
||||
* @param base the base to add
|
||||
* @param count number of reads with this base
|
||||
*/
|
||||
@Requires("count < Byte.MAX_VALUE")
|
||||
public void add(BaseIndex base, byte count, byte qual, byte insQual, byte delQual, double mappingQuality) {
|
||||
counts.add(count);
|
||||
bases.add(base);
|
||||
quals.add(qual);
|
||||
insertionQuals.add(insQual);
|
||||
deletionQuals.add(delQual);
|
||||
this.mappingQuality += mappingQuality;
|
||||
}
|
||||
|
||||
public BaseIndex getBase(int readCoordinate) {
|
||||
return bases.get(readCoordinate);
|
||||
}
|
||||
|
||||
/**
 * Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
 *
 * Invalid reads are :
 * - exclusively composed of deletions
 *
 * @return a GATKSAMRecord or null
 */
public GATKSAMRecord close () {
    if (isAllDeletions())
        return null;  // a read with no actual bases cannot be emitted

    GATKSAMRecord read = new GATKSAMRecord(header);
    read.setReferenceName(contig);
    read.setReferenceIndex(contigIndex);
    read.setReadPairedFlag(false);
    read.setReadUnmappedFlag(false);
    // NOTE: buildCigar() may shift refStart (leading deletions), so it must run
    // before setAlignmentStart below -- do not reorder these two calls.
    read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
    read.setAlignmentStart(refStart);
    read.setReadName(readName);
    read.setBaseQualities(convertBaseQualities(), EventType.BASE_SUBSTITUTION);
    read.setReadBases(convertReadBases());
    // mappingQuality holds a running sum over positions; average it here
    read.setMappingQuality((int) Math.ceil(mappingQuality / bases.size()));
    read.setReadGroup(readGroupRecord);
    read.setAttribute(readTag, convertBaseCounts());

    if (hasIndelQualities) {
        read.setBaseQualities(convertInsertionQualities(), EventType.BASE_INSERTION);
        read.setBaseQualities(convertDeletionQualities(), EventType.BASE_DELETION);
    }

    return read;
}
|
||||
|
||||
/**
|
||||
* Checks if the synthetic read is composed exclusively of deletions
|
||||
*
|
||||
* @return true if it is, false if it isn't.
|
||||
*/
|
||||
private boolean isAllDeletions() {
|
||||
for (BaseIndex b : bases)
|
||||
if (b != BaseIndex.D)
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Number of reference positions covered by this synthetic read (deletions included).
 *
 * @return the number of stored bases
 */
public int size () {
    return bases.size();
}
|
||||
|
||||
// Base qualities for the output record, with deletion positions dropped.
private byte [] convertBaseQualities() {
    return convertVariableGivenBases(bases, quals);
}
|
||||
|
||||
// Insertion qualities for the output record, with deletion positions dropped.
private byte [] convertInsertionQualities() {
    return convertVariableGivenBases(bases, insertionQuals);
}
|
||||
|
||||
// Deletion qualities for the output record, with deletion positions dropped.
private byte [] convertDeletionQualities() {
    return convertVariableGivenBases(bases, deletionQuals);
}
|
||||
|
||||
/**
 * Converts the per-base counts into the reduced-read counts attribute value.
 *
 * Encoding: element 0 holds the first base's count verbatim; every later element
 * is stored as (count - firstCount), clamped into the byte range by MathUtils.bound.
 *
 * @return offset-encoded counts array (deletion positions excluded)
 * @throws ReviewedStingException if the read would produce an empty counts array
 */
protected byte [] convertBaseCounts() {
    byte[] countsArray = convertVariableGivenBases(bases, counts);

    if (countsArray.length == 0)
        throw new ReviewedStingException("Reduced read has counts array of length 0");

    byte[] compressedCountsArray = new byte [countsArray.length];
    compressedCountsArray[0] = countsArray[0];
    for (int i = 1; i < countsArray.length; i++)
        compressedCountsArray[i] = (byte) MathUtils.bound(countsArray[i] - compressedCountsArray[0], Byte.MIN_VALUE, Byte.MAX_VALUE);

    return compressedCountsArray;
}
|
||||
|
||||
private byte [] convertReadBases() {
|
||||
byte [] readArray = new byte[getReadLengthWithNoDeletions(bases)];
|
||||
int i = 0;
|
||||
for (BaseIndex baseIndex : bases)
|
||||
if (baseIndex != BaseIndex.D)
|
||||
readArray[i++] = baseIndex.getByte();
|
||||
|
||||
return readArray;
|
||||
}
|
||||
|
||||
/**
 * Builds the cigar string for the synthetic read.
 *
 * Warning: if the synthetic read has leading deletions, it will shift the refStart
 * (alignment start) of the read — this is a deliberate side effect.
 *
 * Only M and D operators can be produced; an I base is a hard error.
 *
 * @return the cigar string for the synthetic read
 * @throws ReviewedStingException if an insertion base is encountered (unsupported)
 */
private Cigar buildCigar() {
    LinkedList<CigarElement> cigarElements = new LinkedList<CigarElement>();
    CigarOperator cigarOperator = null;  // null until the first non-deletion base is seen
    int length = 0;                      // run length of the current operator
    for (BaseIndex b : bases) {
        CigarOperator op;
        switch (b) {
            case D:
                op = CigarOperator.DELETION;
                break;
            case I:
                throw new ReviewedStingException("Trying to create an insertion in a synthetic read. This operation is currently unsupported.");
            default:
                op = CigarOperator.MATCH_OR_MISMATCH;
                break;
        }
        if (cigarOperator == null) {
            if (op == CigarOperator.D) // read cannot start with a deletion
                refStart++; // if it does, we need to move the reference start forward
            else
                cigarOperator = op;
        }
        else if (cigarOperator != op) { // if this is a new operator, we need to close the previous one
            cigarElements.add(new CigarElement(length, cigarOperator)); // close previous operator
            cigarOperator = op;
            length = 0;
        }

        if (cigarOperator != null) // only increment the length of the cigar element if we really added it to the read (no leading deletions)
            length++;
    }
    // Flush the final run, unless it is a deletion: a read cannot end with a deletion,
    // so a trailing D run is silently dropped.
    if (length > 0 && cigarOperator != CigarOperator.D) // read cannot end with a deletion
        cigarElements.add(new CigarElement(length, cigarOperator)); // add the last cigar element

    return new Cigar(cigarElements);
}
|
||||
|
||||
/**
|
||||
* Shared functionality for all conversion utilities
|
||||
*
|
||||
* @param bases the read bases
|
||||
* @param variable the list to convert
|
||||
* @return a converted variable given the bases and skipping deletions
|
||||
*/
|
||||
|
||||
private static byte [] convertVariableGivenBases (List<BaseIndex> bases, List<Byte> variable) {
|
||||
byte [] variableArray = new byte[getReadLengthWithNoDeletions(bases)];
|
||||
int i = 0;
|
||||
Iterator<Byte> variableIterator = variable.iterator();
|
||||
for (BaseIndex baseIndex : bases) {
|
||||
byte count = variableIterator.next();
|
||||
if (baseIndex != BaseIndex.D)
|
||||
variableArray[i++] = count;
|
||||
}
|
||||
return variableArray;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Shared functionality for all conversion utilities
|
||||
*
|
||||
* @param bases the read bases
|
||||
* @return the length of the read with no deletions
|
||||
*/
|
||||
private static int getReadLengthWithNoDeletions(List<BaseIndex> bases) {
|
||||
int readLength = bases.size();
|
||||
for (BaseIndex baseIndex : bases)
|
||||
if (baseIndex == BaseIndex.D)
|
||||
readLength--;
|
||||
return readLength;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,242 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
|
||||
/**
 * A site-based implementation of an error model: a probability distribution over
 * phred-scaled site qualities, estimated from a reference sample's pileup.
 *
 * User: carneiro
 * Date: 7/21/11
 *
 * Given the reference sample reads at a site, the model stores for each quality q in
 * [minQualityScore, maxQualityScore] the log10-likelihood of the observed mismatch
 * count under a Poisson approximation (see log10PoissonProbabilitySiteGivenQual).
 */
public class ErrorModel {
    private byte maxQualityScore;                 // top of the evaluated quality range
    private byte minQualityScore;                 // bottom of the evaluated quality range
    private byte phredScaledPrior;                // prior for site quality (stored but not read in this class)
    private double log10minPower;                 // log10 of the minimum power (used by the disabled hasPowerForMaxAC below)
    private int refDepth;                         // reference-sample coverage used to build the model (0 when no data)
    private boolean hasData = false;              // true when the model was built from actual reference reads
    private ProbabilityVector probabilityVector;  // the per-quality log10-likelihood vector
    private static final boolean compressRange = false; // whether ProbabilityVector trims insignificant tails

    // log10(e); converts "-lambda" (a natural-log term) into log10 space below.
    private static final double log10MinusE = Math.log10(Math.exp(1.0));

    /**
     * Calculates the probability of the data (reference sample reads) given the phred scaled site quality score.
     *
     * When there is no usable reference data (null pileup, null VC, or no called alleles),
     * the model falls back to a flat, maximum-uncertainty distribution.
     *
     * @param minQualityScore  Minimum site quality score to evaluate
     * @param maxQualityScore  Maximum site quality score to evaluate
     * @param phredScaledPrior Prior for site quality
     * @param refSamplePileup  Reference sample pileup (may be null)
     * @param refSampleVC      VC with true alleles in reference sample pileup (may be null)
     * @param minPower         Minimum power
     */
    public ErrorModel (byte minQualityScore, byte maxQualityScore, byte phredScaledPrior,
                       ReadBackedPileup refSamplePileup, VariantContext refSampleVC, double minPower) {
        this.maxQualityScore = maxQualityScore;
        this.minQualityScore = minQualityScore;
        this.phredScaledPrior = phredScaledPrior;
        log10minPower = Math.log10(minPower);

        // Entries below minQualityScore stay at -Infinity (impossible).
        double[] model = new double[maxQualityScore+1];
        Arrays.fill(model,Double.NEGATIVE_INFINITY);

        boolean hasCalledAlleles = false;
        if (refSampleVC != null) {
            for (Allele allele : refSampleVC.getAlleles()) {
                if (allele.isCalled()) {
                    hasCalledAlleles = true;
                    break;
                }
            }
        }
        if (refSamplePileup == null || refSampleVC == null || !hasCalledAlleles) {
            double p = MathUtils.phredScaleToLog10Probability((byte)(maxQualityScore-minQualityScore));
            for (byte q=minQualityScore; q<=maxQualityScore; q++) {
                // maximum uncertainty if there's no ref data at site
                model[q] = p;
            }
            this.refDepth = 0;
        }
        else {
            hasData = true;
            int matches = 0;
            int coverage = refSamplePileup.getNumberOfElements();

            Allele refAllele = refSampleVC.getReference();

            // A pileup element "matches" when it agrees with ANY allele of the reference-sample VC.
            for (PileupElement refPileupElement : refSamplePileup) {
                boolean isMatch = false;
                for (Allele allele : refSampleVC.getAlleles())
                    isMatch |= pileupElementMatches(refPileupElement, allele, refAllele);

                matches += (isMatch?1:0);
            }

            int mismatches = coverage - matches;
            for (byte q=minQualityScore; q<=maxQualityScore; q++) {
                model[q] = log10PoissonProbabilitySiteGivenQual(q,coverage, mismatches);
            }
            this.refDepth = coverage;
        }

        // compress probability vector
        this.probabilityVector = new ProbabilityVector(model, compressRange);
    }


    /**
     * Simple constructor that just takes a given log-probability vector as error model.
     * Only intended for unit testing, not general usage.
     *
     * @param pvector Given vector of log-probabilities
     */
    public ErrorModel(double[] pvector) {
        this.maxQualityScore = (byte)(pvector.length-1);
        this.minQualityScore = 0;
        this.probabilityVector = new ProbabilityVector(pvector, compressRange);
        this.hasData = true;
    }

    /**
     * Decides whether a pileup element is consistent with the given allele.
     *
     * @param pileupElement element to test
     * @param allele        allele to compare against (ref or alt)
     * @param refAllele     the site's reference allele (for length/indel comparisons)
     * @return true when the element supports the allele
     */
    public static boolean pileupElementMatches(PileupElement pileupElement, Allele allele, Allele refAllele) {
        // if test allele is ref, any base mismatch, or any insertion/deletion at start of pileup count as mismatch
        if (allele.isReference()) {
            // for a ref allele, any base mismatch or new indel is a mismatch.
            if(allele.getBases().length>0 && allele.getBases().length == refAllele.getBases().length ) // SNP/MNP case
                return (/*!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart() &&*/ pileupElement.getBase() == allele.getBases()[0]);
            else
                // either null allele to compare, or ref/alt lengths are different (indel by definition).
                // if we have an indel that we are comparing against a REF allele, any indel presence (of any length/content) is a mismatch
                return (!pileupElement.isBeforeInsertion() && !pileupElement.isBeforeDeletionStart());
        }

        if (refAllele.getBases().length == allele.getBases().length)
            // alleles have the same length (eg snp or mnp)
            return pileupElement.getBase() == allele.getBases()[0];

        // for non-ref alleles of different length than ref (indels):
        byte[] alleleBases = allele.getBases();
        int eventLength = alleleBases.length - refAllele.getBases().length;
        // deletion allele: element must start a deletion of exactly the right length
        if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getEventLength() == -eventLength)
            return true;

        // insertion allele: element must start an insertion with exactly the right bases
        if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
                Arrays.equals(pileupElement.getEventBases().getBytes(),alleleBases))
            return true;

        return false;
    }


    /**
     * What's the log-likelihood that a site's quality is equal to q? If we see N observations and n mismatches,
     * and assuming each match is independent of each other and that the match probability is just dependent of
     * the site quality, so p = 10.^-q/10.
     * Since we'll normally have relatively high Q sites and deep coverage in reference samples (ie p small, N high),
     * to avoid underflows we'll use the Poisson approximation with lambda = N*p.
     * Hence, the log-likelihood of q i.e. Pr(Nmismatches = n | SiteQ = q) ~ Poisson(n | lambda = p*N) with p as above.
     *
     * @param q          Desired q to get likelihood from
     * @param coverage   Total coverage
     * @param mismatches Number of mismatches
     * @return Likelihood of observations as a function of q
     */
    @Requires({
            "q >= minQualityScore",
            "q <= maxQualityScore",
            "coverage >= 0",
            "mismatches >= 0",
            "mismatches <= coverage"
    })
    private double log10PoissonProbabilitySiteGivenQual(byte q, int coverage, int mismatches) {
        // same as log10ProbabilitySiteGivenQual but with Poisson approximation to avoid numerical underflows
        double lambda = MathUtils.phredScaleToProbability(q) * (double )coverage;
        // log10(e^-lambda*lambda^k/k!) = -lambda + k*log10(lambda) - log10factorial(k)
        return Math.log10(lambda)*mismatches - lambda*log10MinusE- MathUtils.log10Factorial(mismatches);
    }

    // NOTE(review): qual indexes the probability vector directly; the contract below
    // presumably guards the valid index range — confirm against ProbabilityVector.
    @Requires({"qual-minQualityScore <= maxQualityScore"})
    public double getSiteLogErrorProbabilityGivenQual (int qual) {
        return probabilityVector.getLogProbabilityForIndex(qual);
    }

    public byte getMaxQualityScore() {
        return maxQualityScore;
    }

    public byte getMinQualityScore() {
        return minQualityScore;
    }

    // Lowest quality with non-negligible probability, after tail compression.
    public int getMinSignificantQualityScore() {
        return new ProbabilityVector(probabilityVector,true).getMinVal();
    }

    // Highest quality with non-negligible probability, after tail compression.
    public int getMaxSignificantQualityScore() {
        return new ProbabilityVector(probabilityVector,true).getMaxVal();
    }

    public int getReferenceDepth() {
        return refDepth;
    }
    public boolean hasData() {
        return hasData;
    }

    public ProbabilityVector getErrorModelVector() {
        return probabilityVector;
    }

    /** Comma-separated rendering of the stored log-probability vector, e.g. "(0.0000,-1.2341)". */
    public String toString() {
        String result = "(";
        boolean skipComma = true;
        for (double v : probabilityVector.getProbabilityVector()) {
            if (skipComma) {
                skipComma = false;
            }
            else {
                result += ",";
            }
            result += String.format("%.4f", v);
        }
        return result + ")";
    }

    /** Sums the reference depth over every per-lane error model. */
    public static int getTotalReferenceDepth(HashMap<String, ErrorModel> perLaneErrorModels) {
        int n=0;
        for (ErrorModel e : perLaneErrorModels.values()) {
            n += e.getReferenceDepth();
        }
        return n;
    }

    /*
    @Requires({"maxAlleleCount >= 0"})
    //todo -- memoize this function
    public boolean hasPowerForMaxAC (int maxAlleleCount) {
        int siteQ = (int) Math.ceil(MathUtils.probabilityToPhredScale((double) 1/maxAlleleCount));
        double log10CumSum = getCumulativeSum(siteQ);
        return log10CumSum < log10minPower;
    } */
}
|
||||
|
|
@ -0,0 +1,706 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
public class PoolAFCalculationModel extends AlleleFrequencyCalculationModel {
|
||||
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
final protected UnifiedArgumentCollection UAC;        // argument collection supplied at construction

private final int ploidy;                             // per-pool ploidy, taken from UAC.samplePloidy
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final static boolean VERBOSE = false;         // enables System.out tracing of the AC-conformation queue
|
||||
/**
 * Builds the pool-aware allele frequency calculation model.
 *
 * @param UAC           unified argument collection (supplies samplePloidy)
 * @param N             sample count, forwarded to the superclass
 * @param logger        logger, forwarded to the superclass
 * @param verboseWriter verbose output stream, forwarded to the superclass
 */
protected PoolAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) {
    super(UAC, N, logger, verboseWriter);
    ploidy = UAC.samplePloidy;
    this.UAC = UAC;
}
|
||||
|
||||
/**
 * Computes the combined allele frequency distribution over all pools.
 *
 * If the context carries more alternate alleles than the model can genotype,
 * the likelihood-best subset is chosen and the GLs are subset accordingly
 * before the pools are combined.
 *
 * @param vc                         input variant context
 * @param log10AlleleFrequencyPriors frequency priors
 * @param result                     output object filled by combineSinglePools
 * @return the (possibly reduced) list of alleles actually genotyped, ref first
 */
public List<Allele> getLog10PNonRef(final VariantContext vc,
                                    final double[] log10AlleleFrequencyPriors,
                                    final AlleleFrequencyCalculationResult result) {
    GenotypesContext GLs = vc.getGenotypes();
    List<Allele> alleles = vc.getAlleles();

    // don't try to genotype too many alternate alleles
    if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) {
        logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");

        alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1);
        alleles.add(vc.getReference());
        alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy));

        // re-project the GLs onto the reduced allele set
        GLs = subsetAlleles(vc, alleles, false, ploidy);
    }

    combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);

    return alleles;
}
|
||||
|
||||
|
||||
/**
 * Simple wrapper class to hold values of combined pool likelihoods.
 * For fast hashing and fast retrieval, there's a hash map that shadows the main list.
 */
static class CombinedPoolLikelihoods {
    private LinkedList<ExactACset> alleleCountSetList;         // insertion-ordered conformations; AC=0 is always first
    private HashMap<ExactACcounts,ExactACset> conformationMap; // shadow index for O(1) lookup by AC counts
    private double maxLikelihood;                              // best log10-likelihood seen so far


    public CombinedPoolLikelihoods() {
        alleleCountSetList = new LinkedList<ExactACset>();
        conformationMap = new HashMap<ExactACcounts,ExactACset>();
        maxLikelihood = Double.NEGATIVE_INFINITY;
    }

    /** Records a conformation in both the list and the shadow map, tracking the running maximum. */
    public void add(ExactACset set) {
        alleleCountSetList.add(set);
        conformationMap.put(set.ACcounts, set);
        final double likelihood = set.log10Likelihoods[0];

        if (likelihood > maxLikelihood )
            maxLikelihood = likelihood;
    }

    /** True when the given allele-count vector has already been stored. */
    public boolean hasConformation(int[] ac) {
        return conformationMap.containsKey(new ExactACcounts(ac));
    }

    // NOTE(review): throws NPE if the conformation is absent — callers are expected
    // to check hasConformation first.
    public double getLikelihoodOfConformation(int[] ac) {
        return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0];
    }

    public double getGLOfACZero() {
        return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list
    }

    public int getLength() {
        return alleleCountSetList.size();
    }
}
|
||||
|
||||
/**
 * Chooses the N most likely alternate alleles in a set of pools (samples), based on
 * the GL sum over alt alleles: for each pool, the likelihood of its best genotype is
 * credited to every alt allele present in that genotype.
 *
 * @param vc                 Input variant context
 * @param numAllelesToChoose Number of alleles to choose
 * @param ploidy             Ploidy per pool
 * @return the numAllelesToChoose most likely alt alleles, in the VC's original allele order
 */
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) {
    final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
    final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
    for ( int i = 0; i < numOriginalAltAlleles; i++ )
        likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));

    // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
    final ArrayList<double[]> GLs = getGLs(vc.getGenotypes());
    for ( final double[] likelihoods : GLs ) {

        final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
        final int[] acCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(1+numOriginalAltAlleles,ploidy,PLindexOfBestGL);
        // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele
        for (int k=1; k < acCount.length;k++) {
            if (acCount[k] > 0)
                likelihoodSums[k-1].sum += likelihoods[PLindexOfBestGL];
        }
    }

    // sort them by probability mass and choose the best ones.
    // Note: Arrays.asList returns a view, so sorting it sorts likelihoodSums itself.
    // NOTE(review): assumes LikelihoodSum's compareTo ranks highest probability mass
    // first — confirm against the LikelihoodSum declaration.
    Collections.sort(Arrays.asList(likelihoodSums));
    final ArrayList<Allele> bestAlleles = new ArrayList<Allele>(numAllelesToChoose);
    for ( int i = 0; i < numAllelesToChoose; i++ )
        bestAlleles.add(likelihoodSums[i].allele);

    // re-emit the chosen alleles in the VC's original order
    final ArrayList<Allele> orderedBestAlleles = new ArrayList<Allele>(numAllelesToChoose);
    for ( Allele allele : vc.getAlternateAlleles() ) {
        if ( bestAlleles.contains(allele) )
            orderedBestAlleles.add(allele);
    }

    return orderedBestAlleles;
}
|
||||
|
||||
|
||||
/**
 * Simple non-optimized version that combines GLs from several pools and produces a global AF distribution.
 *
 * Pools are folded in one at a time, starting from a trivial zero-ploidy
 * distribution; likelihoods are renormalized at each step.
 *
 * @param GLs                        Input genotypes context with per-pool GLs
 * @param numAlleles                 Number of alleles (including ref)
 * @param ploidyPerPool              Number of chromosomes per pool
 * @param log10AlleleFrequencyPriors Frequency priors
 * @param result                     object to fill with output values
 */
protected static void combineSinglePools(final GenotypesContext GLs,
                                         final int numAlleles,
                                         final int ploidyPerPool,
                                         final double[] log10AlleleFrequencyPriors,
                                         final AlleleFrequencyCalculationResult result) {

    final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs);

    int combinedPloidy = 0;

    // Combine each pool incrementally - likelihoods will be renormalized at each step
    CombinedPoolLikelihoods combinedPoolLikelihoods = new CombinedPoolLikelihoods();

    // first element: zero ploidy, e.g. trivial degenerate distribution
    final int[] zeroCounts = new int[numAlleles];
    final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts));
    set.log10Likelihoods[0] = 0.0;

    combinedPoolLikelihoods.add(set);
    // NOTE(review): the loop starts at p=1 — presumably getGLs() prepends a dummy
    // element at index 0 (1-based indexing convention); confirm against getGLs,
    // otherwise the first pool's likelihoods would be skipped entirely.
    for (int p=1; p<genotypeLikelihoods.size(); p++) {
        result.reset();
        combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool,
                numAlleles, log10AlleleFrequencyPriors, result);
        combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
    }
}
|
||||
|
||||
/**
 * Combines one new pool GL vector into the running combined likelihoods using a
 * breadth-first walk over AC conformations, pruning branches whose likelihood falls
 * more than MAX_LOG10_ERROR_TO_STOP_EARLY below the best seen so far.
 *
 * @param originalPool               combined likelihoods accumulated so far
 * @param newGL                      GL vector of the pool being folded in
 * @param originalPloidy             total chromosomes already combined
 * @param newGLPloidy                chromosomes in the new pool
 * @param numAlleles                 number of alleles (including ref)
 * @param log10AlleleFrequencyPriors frequency priors
 * @param result                     AF result object updated during computation
 * @return the new combined pool likelihoods
 */
public static CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool, double[] newGL, int originalPloidy, int newGLPloidy, int numAlleles,
                                                                  final double[] log10AlleleFrequencyPriors,
                                                                  final AlleleFrequencyCalculationResult result) {

    final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
    // mapping of ExactACset indexes to the objects
    final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>();
    final CombinedPoolLikelihoods newPool = new CombinedPoolLikelihoods();

    // add AC=0 to the queue: all chromosomes assigned to the reference allele
    final int[] zeroCounts = new int[numAlleles];
    final int newPloidy = originalPloidy + newGLPloidy;
    zeroCounts[0] = newPloidy;

    ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts));

    ACqueue.add(zeroSet);
    indexesToACset.put(zeroSet.ACcounts, zeroSet);

    // keep processing while we have AC conformations that need to be calculated
    double maxLog10L = Double.NEGATIVE_INFINITY;
    while ( !ACqueue.isEmpty() ) {
        // compute log10Likelihoods
        final ExactACset ACset = ACqueue.remove();
        final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLog10L, ACqueue, indexesToACset);
        maxLog10L = Math.max(maxLog10L, log10LofKs);
        // clean up memory
        indexesToACset.remove(ACset.ACcounts);
        if ( VERBOSE )
            System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);

    }
    return newPool;
}
|
||||
|
||||
// todo - refactor, function almost identical except for log10LofK computation in PoolGenotypeLikelihoods
/**
 * Computes the likelihood of one AC conformation, stores it in the new pool, and
 * (unless pruned) enqueues the neighboring higher-frequency conformations.
 *
 * @param set                        ExactACset holding conformation to be computed
 * @param newPool                    New pool likelihood holder
 * @param originalPool               Original likelihood holder
 * @param newGL                      New pool GL vector to combine
 * @param log10AlleleFrequencyPriors Prior object
 * @param originalPloidy             Total ploidy of original combined pool
 * @param newGLPloidy                Ploidy of GL vector
 * @param result                     AFResult object
 * @param maxLog10L                  max likelihood observed so far
 * @param ACqueue                    Queue of conformations to compute
 * @param indexesToACset             AC indices of objects in queue
 * @return log likelihood of the computed conformation
 */
private static double calculateACConformationAndUpdateQueue(final ExactACset set,
                                                            final CombinedPoolLikelihoods newPool,
                                                            final CombinedPoolLikelihoods originalPool,
                                                            final double[] newGL,
                                                            final double[] log10AlleleFrequencyPriors,
                                                            final int originalPloidy,
                                                            final int newGLPloidy,
                                                            final AlleleFrequencyCalculationResult result,
                                                            final double maxLog10L,
                                                            final LinkedList<ExactACset> ACqueue,
                                                            final HashMap<ExactACcounts, ExactACset> indexesToACset) {

    // compute likelihood in "set" of new set based on original likelihoods
    final int numAlleles = set.ACcounts.counts.length;
    final int newPloidy = set.getACsum();
    final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result);

    // add to new pool (skip conformations with zero probability)
    if (!Double.isInfinite(log10LofK))
        newPool.add(set);

    // prune: don't expand branches far below the current best likelihood
    if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
        if ( VERBOSE )
            System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
        return log10LofK;
    }

    // iterate over higher frequencies if possible
    // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
    // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space
    final int ACwiggle = set.ACcounts.counts[0];
    if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
        return log10LofK;

    // add conformations for other cases: bump each alt allele count by one,
    // re-deriving the ref count so the total stays at newPloidy
    for ( int allele = 1; allele < numAlleles; allele++ ) {
        final int[] ACcountsClone = set.ACcounts.getCounts().clone();
        ACcountsClone[allele]++;
        // is this a valid conformation?
        int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
        ACcountsClone[0] = newPloidy - altSum;
        if (ACcountsClone[0] < 0)
            continue;

        PoolGenotypeLikelihoods.updateACset(ACcountsClone, ACqueue, indexesToACset);
    }

    return log10LofK;
}
|
||||
|
||||
|
||||
/**
 * Naive combiner of two multiallelic pools - number of alt alleles must be the same.
 * Math is generalization of the biallelic combiner.
 *
 * For vector K representing an allele count conformation,
 * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
 * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
 *
 * NOTE(review): the entire body is commented out — this method is currently a no-op
 * kept only as documentation of the naive algorithm that fastCombineMultiallelicPool
 * replaces. The commented code references an add/getLikelihoodsAsVector API that
 * CombinedPoolLikelihoods (above) no longer exposes, so it would not compile as-is.
 *
 * @param originalPool               First log-likelihood pool GL vector
 * @param yy                         Second pool GL vector
 * @param ploidy1                    Ploidy of first pool (# of chromosomes in it)
 * @param ploidy2                    Ploidy of second pool
 * @param numAlleles                 Number of alleles
 * @param log10AlleleFrequencyPriors Array of biallelic priors
 * @param result                     AF calculation result object
 */
public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
                                                  final double[] log10AlleleFrequencyPriors,
                                                  final AlleleFrequencyCalculationResult result) {
    /*
    final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
    final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);

    if (dim1 != originalPool.getLength() || dim2 != yy.length)
        throw new ReviewedStingException("BUG: Inconsistent vector length");

    if (ploidy2 == 0)
        return;

    final int newPloidy = ploidy1 + ploidy2;

    // Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
    // and L2(K) = Pr(D|AC2=K) * choose(m2,K)
    PoolGenotypeLikelihoods.SumIterator firstIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
    final double[] x = originalPool.getLikelihoodsAsVector(true);
    while(firstIterator.hasNext()) {
        x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
        firstIterator.next();
    }

    PoolGenotypeLikelihoods.SumIterator secondIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
    final double[] y = yy.clone();
    while(secondIterator.hasNext()) {
        y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
        secondIterator.next();
    }

    // initialize output to -log10(choose(m1+m2,[k1 k2...])
    final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
    final PoolGenotypeLikelihoods.SumIterator outputIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);

    // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
    while(outputIterator.hasNext()) {
        final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
        double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);

        originalPool.add(likelihood, set, outputIterator.getLinearIndex());
        outputIterator.next();
    }
    */
}
|
||||
|
||||
    /**
     * Computes the log10 likelihood of one particular allele-count (AC) conformation for the
     * combined pool and records it (plus AF=0 / MLE / MAP bookkeeping) in the result object.
     *
     * The combined likelihood is logSum over all splits G1+G2 = K of
     * L1(G1) + L2(G2) + log10 multinomial weights, minus the combined multinomial denominator.
     *
     * @param set                          AC conformation to compute; its log10Likelihoods[0] slot is filled in here
     * @param firstGLs                     Original pool likelihoods before combining
     * @param secondGL                     New GL vector with additional pool
     * @param log10AlleleFrequencyPriors   Allele frequency priors, indexed by alt allele count
     * @param numAlleles                   Number of alleles (including ref)
     * @param ploidy1                      Ploidy of original pool (combined)
     * @param ploidy2                      Ploidy of new pool
     * @param result                       AFResult object, updated with AF=0 likelihood/posterior and MLE/MAP
     * @return log-likelihood of requested conformation (with priors applied)
     * @throws ReviewedStingException if the conformation's total count disagrees with ploidy1+ploidy2
     */
    private static double computeLofK(final ExactACset set,
                                      final CombinedPoolLikelihoods firstGLs,
                                      final double[] secondGL,
                                      final double[] log10AlleleFrequencyPriors,
                                      final int numAlleles, final int ploidy1, final int ploidy2,
                                      final AlleleFrequencyCalculationResult result) {

        final int newPloidy = ploidy1 + ploidy2;

        // sanity check: an ExactACset conformation spans every allele (ref included),
        // so its counts must sum exactly to the combined ploidy
        int totalAltK = set.getACsum();
        if (newPloidy != totalAltK)
            throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values");

        // subtract the reference-allele count (convention: index 0 is ref);
        // totalAltK now holds the number of alt alleles in the conformation
        totalAltK -= set.ACcounts.counts[0];

        // special case for k = 0 over all k
        if ( totalAltK == 0 ) {   // all-ref case: likelihood factorizes into the two pools' AC=0 likelihoods
            final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
            set.log10Likelihoods[0] = log10Lof0;

            result.setLog10LikelihoodOfAFzero(log10Lof0);
            result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);

        } else {

            // initialize result with denominator
            // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy.
            // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i
            int[] currentCount = set.ACcounts.getCounts();
            double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);

            // for current conformation, get all possible ways to break vector K into two components G1 and G2
            final PoolGenotypeLikelihoods.SumIterator innerIterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
            set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY;
            while (innerIterator.hasNext()) {
                // check if breaking current conformation into g1 and g2 is feasible.
                final int[] acCount2 = innerIterator.getCurrentVector();
                final int[] acCount1 = MathUtils.vectorDiff(currentCount, acCount2);
                final int idx2 = innerIterator.getLinearIndex();
                // see if conformation is valid and if original pool had this conformation
                // for conformation to be valid, all elements of g2 have to be <= elements of current AC set
                if (isValidConformation(acCount1,ploidy1) && firstGLs.hasConformation(acCount1)) {
                    final double gl2 = secondGL[idx2];
                    // -Infinity likelihoods contribute nothing to the log-sum; skip them
                    if (!Double.isInfinite(gl2)) {
                        final double firstGL = firstGLs.getLikelihoodOfConformation(acCount1);
                        final double num1 = MathUtils.log10MultinomialCoefficient(ploidy1, acCount1);
                        final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2);
                        final double sum = firstGL + gl2 + num1 + num2;

                        set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum);
                    }
                }
                innerIterator.next();
            }

            // apply the shared multinomial denominator once, after accumulation
            set.log10Likelihoods[0] += denom;
        }

        double log10LofK = set.log10Likelihoods[0];

        // update the MLE if necessary (MLE is over the prior-free likelihood)
        final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length);
        result.updateMLEifNeeded(log10LofK, altCounts);

        // apply the priors over each alternate allele
        for (final int ACcount : altCounts ) {
            if ( ACcount > 0 )
                log10LofK += log10AlleleFrequencyPriors[ACcount];
        }
        result.updateMAPifNeeded(log10LofK, altCounts);

        return log10LofK;
    }
|
||||
|
||||
/**
|
||||
* Small helper routine - is a particular AC conformationv vector valid? ie are all elements non-negative and sum to ploidy?
|
||||
* @param set AC conformation vector
|
||||
* @param ploidy Ploidy of set
|
||||
* @return Valid conformation
|
||||
*/
|
||||
private static boolean isValidConformation(final int[] set, final int ploidy) {
|
||||
int sum=0;
|
||||
for (final int ac: set) {
|
||||
if (ac < 0)
|
||||
return false;
|
||||
sum += ac;
|
||||
|
||||
}
|
||||
|
||||
return (sum == ploidy);
|
||||
}
|
||||
|
||||
/**
|
||||
* Combines naively two biallelic pools (of arbitrary size).
|
||||
* For two pools of size m1 and m2, we can compute the combined likelihood as:
|
||||
* Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k)
|
||||
* @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i
|
||||
* @param newPLVector Second GL vector
|
||||
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
|
||||
* @param ploidy2 Ploidy of second pool
|
||||
* @param log10AlleleFrequencyPriors Array of biallelic priors
|
||||
* @param result Af calculation result object
|
||||
* @return Combined likelihood vector
|
||||
*/
|
||||
public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
|
||||
final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
|
||||
final AlleleFrequencyCalculationResult result) {
|
||||
|
||||
final int newPloidy = ploidy1 + ploidy2;
|
||||
|
||||
final double[] combinedLikelihoods = new double[1+newPloidy];
|
||||
|
||||
/** Pre-fill result array and incorporate weights into input vectors
|
||||
* Say L1(k) = Pr(D|AC1=k) * choose(m1,k)
|
||||
* and L2(k) = Pr(D|AC2=k) * choose(m2,k)
|
||||
* equation reduces to
|
||||
* Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j)
|
||||
* which is just plain convolution of L1 and L2 (with pre-existing vector)
|
||||
*/
|
||||
|
||||
// intialize result vector to -infinity
|
||||
Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY);
|
||||
|
||||
final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1);
|
||||
for (int k=originalPool.getProbabilityVector().length; k< x.length; k++)
|
||||
x[k] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
final double[] y = newPLVector.clone();
|
||||
|
||||
|
||||
final double log10Lof0 = x[0]+y[0];
|
||||
result.setLog10LikelihoodOfAFzero(log10Lof0);
|
||||
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
|
||||
|
||||
double maxElement = log10Lof0;
|
||||
int maxElementIdx = 0;
|
||||
int[] alleleCounts = new int[1];
|
||||
for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) {
|
||||
double[] acc = new double[k+1];
|
||||
Arrays.fill(acc,Double.NEGATIVE_INFINITY);
|
||||
double innerMax = Double.NEGATIVE_INFINITY;
|
||||
|
||||
for (int j=0; j <=k; j++) {
|
||||
double x1,y1;
|
||||
|
||||
|
||||
if (k-j>=0 && k-j < y.length)
|
||||
y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j);
|
||||
else
|
||||
continue;
|
||||
|
||||
if (j < x.length)
|
||||
x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j);
|
||||
else
|
||||
continue;
|
||||
|
||||
if (Double.isInfinite(x1) || Double.isInfinite(y1))
|
||||
continue;
|
||||
acc[j] = x1 + y1;
|
||||
if (acc[j] > innerMax)
|
||||
innerMax = acc[j];
|
||||
else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY)
|
||||
break;
|
||||
}
|
||||
combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k);
|
||||
maxElementIdx = k;
|
||||
double maxDiff = combinedLikelihoods[k] - maxElement;
|
||||
if (maxDiff > 0)
|
||||
maxElement = combinedLikelihoods[k];
|
||||
else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) {
|
||||
break;
|
||||
}
|
||||
|
||||
alleleCounts[0] = k;
|
||||
result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
|
||||
result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true));
|
||||
}
|
||||
|
||||
|
||||
    /**
     * From a given variant context, extract a given subset of alleles, and update genotype context accordingly,
     * including updating the PL's, and assign genotypes accordingly
     * @param vc                variant context with alleles and genotype likelihoods
     * @param allelesToUse      alleles to subset (including the reference allele)
     * @param assignGenotypes   true: assign hard genotypes, false: leave as no-call
     * @param ploidy            number of chromosomes per sample (pool)
     * @return GenotypesContext with new PLs
     */
    public GenotypesContext subsetAlleles(final VariantContext vc,
                                          final List<Allele> allelesToUse,
                                          final boolean assignGenotypes,
                                          final int ploidy) {
        // the genotypes with PLs
        final GenotypesContext oldGTs = vc.getGenotypes();
        // a no-call genotype for a pool is NO_CALL repeated once per chromosome
        List<Allele> NO_CALL_ALLELES = new ArrayList<Allele>(ploidy);

        for (int k=0; k < ploidy; k++)
            NO_CALL_ALLELES.add(Allele.NO_CALL);

        // samples
        final List<String> sampleIndices = oldGTs.getSampleNamesOrderedByName();

        // the new genotypes to create
        final GenotypesContext newGTs = GenotypesContext.create();

        // we need to determine which of the alternate alleles (and hence the likelihoods) to use and carry forward
        final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
        final int numNewAltAlleles = allelesToUse.size() - 1;

        // create the new genotypes
        for ( int k = 0; k < oldGTs.size(); k++ ) {
            final Genotype g = oldGTs.get(sampleIndices.get(k));
            // no likelihoods -> nothing to subset; emit a no-call genotype
            if ( !g.hasLikelihoods() ) {
                newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
                continue;
            }

            // create the new likelihoods array from the alleles we are allowed to use
            final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
            double[] newLikelihoods;
            // same number of alt alleles means nothing is dropped, so the PL vector is reused as-is
            if ( numOriginalAltAlleles == numNewAltAlleles) {
                newLikelihoods = originalLikelihoods;
            } else {
                newLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(),allelesToUse);

                // might need to re-normalize
                newLikelihoods = MathUtils.normalizeFromLog10(newLikelihoods, false, true);
            }

            // if there is no mass on the (new) likelihoods, then just no-call the sample
            // (sum of log10 GLs close to 0 means the distribution is flat/uninformative;
            // NOTE(review): exact semantics depend on SUM_GL_THRESH_NOCALL, declared elsewhere — confirm)
            if ( MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL ) {
                newGTs.add(GenotypeBuilder.create(g.getSampleName(), NO_CALL_ALLELES));
            }
            else {
                final GenotypeBuilder gb = new GenotypeBuilder(g);

                // with no alt alleles left there is only one conformation, so PLs carry no information
                if ( numNewAltAlleles == 0 )
                    gb.noPL();
                else
                    gb.PL(newLikelihoods);

                // if we weren't asked to assign a genotype, then just no-call the sample
                if ( !assignGenotypes || MathUtils.sum(newLikelihoods) > VariantContextUtils.SUM_GL_THRESH_NOCALL )
                    gb.alleles(NO_CALL_ALLELES);
                else
                    assignGenotype(gb, newLikelihoods, allelesToUse, ploidy);
                newGTs.add(gb.make());
            }
        }

        return newGTs;

    }
|
||||
|
||||
    /**
     * Assign genotypes (GTs) to the samples in the Variant Context greedily based on the PLs
     *
     * Picks the maximum-likelihood allele-count conformation, writes per-pool MLE AC/AF
     * attributes, fills in the genotype's allele list, and sets the genotype quality.
     *
     * @param gb             builder for the genotype being assigned (mutated in place)
     * @param newLikelihoods the PL array
     * @param allelesToUse   the list of alleles to choose from (corresponding to the PLs)
     * @param numChromosomes Number of chromosomes per pool
     */
    private static void assignGenotype(final GenotypeBuilder gb,
                                       final double[] newLikelihoods,
                                       final List<Allele> allelesToUse,
                                       final int numChromosomes) {
        final int numNewAltAlleles = allelesToUse.size() - 1;

        // find the genotype with maximum likelihoods
        final int PLindex = numNewAltAlleles == 0 ? 0 : MathUtils.maxElementIndex(newLikelihoods);

        // translate the flat PL index back into an allele-count vector (ref at index 0)
        final int[] mlAlleleCount = PoolGenotypeLikelihoods.getAlleleCountFromPLIndex(allelesToUse.size(), numChromosomes, PLindex);
        final ArrayList<Double> alleleFreqs = new ArrayList<Double>();
        final ArrayList<Integer> alleleCounts = new ArrayList<Integer>();

        // collect per-alt-allele counts and frequencies (skip ref at index 0)
        for (int k=1; k < mlAlleleCount.length; k++) {
            alleleCounts.add(mlAlleleCount[k]);
            final double freq = (double)mlAlleleCount[k] / (double)numChromosomes;
            alleleFreqs.add(freq);

        }

        // per-pool logging of AC and AF
        gb.attribute(VCFConstants.MLE_ALLELE_COUNT_KEY, alleleCounts.size() == 1 ? alleleCounts.get(0) : alleleCounts);
        gb.attribute(VCFConstants.MLE_ALLELE_FREQUENCY_KEY, alleleFreqs.size() == 1 ? alleleFreqs.get(0) : alleleFreqs);

        // remove PLs if necessary (avoid emitting huge PL vectors)
        if (newLikelihoods.length > MAX_LENGTH_FOR_POOL_PL_LOGGING)
            gb.noPL();

        ArrayList<Allele> myAlleles = new ArrayList<Allele>();

        // add list of called ML genotypes to alleles list
        // each allele is repeated as many times as its ML count
        // TODO - too unwieldy?
        int idx = 0;
        for (int mlind = 0; mlind < mlAlleleCount.length; mlind++) {
            for (int k=0; k < mlAlleleCount[mlind]; k++)
                myAlleles.add(idx++,allelesToUse.get(mlind));
        }
        gb.alleles(myAlleles);

        // genotype quality derived from the distance between best and second-best likelihoods
        if ( numNewAltAlleles > 0 )
            gb.log10PError(GenotypeLikelihoods.getGQLog10FromLikelihoods(PLindex, newLikelihoods));
    }
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,656 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeLikelihoods;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public abstract class PoolGenotypeLikelihoods {
    // ploidy of the pool, i.e. the total number of chromosomes it contains
    protected final int numChromosomes;
    private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6

    protected static final boolean VERBOSE = false;
    // quality-score lookup table, one slot per representable phred score
    protected static final double qualVec[] = new double[SAMUtils.MAX_PHRED_SCORE+1];

    //
    // The fundamental data arrays associated with a Genotype Likelihoods object
    //
    // log10 likelihood per AC conformation, ordered according to SumIterator
    protected double[] log10Likelihoods;
    protected double[][] logMismatchProbabilityArray;

    // derived as numChromosomes/2 in the constructor (assumes diploid samples within a pool)
    protected final int nSamplesPerPool;
    protected final HashMap<String, ErrorModel> perLaneErrorModels;
    // number of elements in the likelihood vector for (nAlleles, numChromosomes)
    protected final int likelihoodDim;
    protected final boolean ignoreLaneInformation;
    protected final double LOG10_PLOIDY;
    // true iff at least one lane's error model has data (set in the constructor)
    protected boolean hasReferenceSampleData;

    protected final int nAlleles;
    protected final List<Allele> alleles;

    private static final double MIN_LIKELIHOOD = Double.NEGATIVE_INFINITY;

    // hard limits guarded in the constructor; also size the likelihood-vector cache below
    private static final int MAX_NUM_ALLELES_TO_CACHE = 20;
    private static final int MAX_NUM_SAMPLES_PER_POOL = 1000;

    // toggles the queue-based pruned GL computation in computeLikelihoods()
    private static final boolean FAST_GL_COMPUTATION = true;
|
||||
    /**
     * Constructor with given logPL elements.
     *
     * @param alleles               alleles this likelihoods object is defined over (ref first by convention)
     * @param logLikelihoods        pre-computed log10 likelihood vector, or null to initialize all
     *                              entries to -infinity; length must equal the likelihood dimension
     * @param ploidy                total number of chromosomes in the pool
     * @param perLaneErrorModels    per-lane reference-sample error models (may be null/empty)
     * @param ignoreLaneInformation whether to collapse per-lane information
     * @throws UserException          if allele count or pool size exceeds the supported/cached maxima
     * @throws ReviewedStingException if logLikelihoods has the wrong length
     */
    public PoolGenotypeLikelihoods(final List<Allele> alleles, final double[] logLikelihoods, final int ploidy,
                                   final HashMap<String, ErrorModel> perLaneErrorModels, final boolean ignoreLaneInformation) {
        this.alleles = alleles;
        this.nAlleles = alleles.size();
        numChromosomes = ploidy;
        // assumes diploid individuals within the pool
        nSamplesPerPool = numChromosomes/2;
        this.perLaneErrorModels = perLaneErrorModels;
        this.ignoreLaneInformation = ignoreLaneInformation;

        // check if at least one lane has actual data
        if (perLaneErrorModels == null || perLaneErrorModels.isEmpty())
            hasReferenceSampleData = false;
        else {
            for (Map.Entry<String,ErrorModel> elt : perLaneErrorModels.entrySet()) {
                if (elt.getValue().hasData()) {
                    hasReferenceSampleData = true;
                    break;
                }
            }
        }
        // check sizes against the cache limits
        if (nAlleles > MAX_NUM_ALLELES_TO_CACHE)
            throw new UserException("No support for this number of alleles");

        if (nSamplesPerPool > MAX_NUM_SAMPLES_PER_POOL)
            throw new UserException("No support for such large number of samples per pool");

        likelihoodDim = GenotypeLikelihoods.numLikelihoods(nAlleles, numChromosomes);

        if (logLikelihoods == null){
            // no vector supplied: start with all conformations at log10(0)
            log10Likelihoods = new double[likelihoodDim];
            Arrays.fill(log10Likelihoods, MIN_LIKELIHOOD);
        } else {
            if (logLikelihoods.length != likelihoodDim)
                throw new ReviewedStingException("BUG: inconsistent parameters when creating PoolGenotypeLikelihoods object");

            // NOTE(review): the supplied array is stored without a defensive copy, so callers
            // share mutable state with this object — confirm that is intended
            log10Likelihoods = logLikelihoods; //.clone(); // is clone needed?
        }
        fillCache();
        LOG10_PLOIDY = Math.log10((double)numChromosomes);
    }
|
||||
|
||||
|
||||
/**
|
||||
* Crucial inner class that handles addressing elements of pool likelihoods. We store likelihoods as a map
|
||||
* of form int[] -> double (to be more precise, IntArrayWrapper -> Double).
|
||||
* For a given ploidy (chromosome count) and number of alleles, we need a form to iterate deterministically
|
||||
* across all possible allele conformations.
|
||||
* Problem equivalent to listing in determistic order all possible ways in which N integers will sum to P,
|
||||
* where N is number of alleles and P is number of chromosomes.
|
||||
* There's an option to list all integers so that sum will be UP to P.
|
||||
* For example, with P=2,N=2, restrictSumTo = 2 iterator will produce
|
||||
* [2 0 ] [1 1] [ 0 2]
|
||||
*
|
||||
*
|
||||
*/
|
||||
protected static class SumIterator {
|
||||
private int[] currentState;
|
||||
private final int[] finalState;
|
||||
private final int restrictSumTo;
|
||||
private final int dim;
|
||||
private boolean hasNext;
|
||||
private int linearIndex;
|
||||
private int currentSum;
|
||||
|
||||
/**
|
||||
* Default constructor. Typical use case: restrictSumTo = -1 if there's no sum restriction, or will generate int[]
|
||||
* vectors so that all add to this value.
|
||||
*
|
||||
* @param finalState End state - typically we should set value to (P,P,P,...)
|
||||
* @param restrictSumTo See above
|
||||
*/
|
||||
public SumIterator(final int[] finalState,final int restrictSumTo) {
|
||||
this.finalState = finalState;
|
||||
this.dim = finalState.length;
|
||||
this.restrictSumTo = restrictSumTo;
|
||||
currentState = new int[dim];
|
||||
reset();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Shortcut constructor for common use case: iterator will produce
|
||||
* all vectors of length numAlleles whose sum = numChromosomes
|
||||
* @param numAlleles Number of alleles
|
||||
* @param numChromosomes Ploidy
|
||||
*/
|
||||
public SumIterator(final int numAlleles, final int numChromosomes) {
|
||||
this(getInitialStateVector(numAlleles,numChromosomes), numChromosomes);
|
||||
}
|
||||
|
||||
|
||||
private static int[] getInitialStateVector(final int nAlleles, final int numChromosomes) {
|
||||
int[] initialState = new int[nAlleles];
|
||||
Arrays.fill(initialState,numChromosomes);
|
||||
return initialState;
|
||||
}
|
||||
|
||||
public void setInitialStateVector(final int[] stateVector) {
|
||||
if (restrictSumTo > 0) {
|
||||
// check that desired vector is valid
|
||||
if (MathUtils.sum(stateVector) != restrictSumTo)
|
||||
throw new ReviewedStingException("BUG: initial state vector nor compatible with sum iterator");
|
||||
|
||||
final int numAlleles = currentState.length;
|
||||
final int ploidy = restrictSumTo;
|
||||
|
||||
linearIndex = PoolGenotypeLikelihoods.getLinearIndex(stateVector, numAlleles, ploidy);
|
||||
}
|
||||
else
|
||||
throw new ReviewedStingException("BUG: Not supported");
|
||||
|
||||
}
|
||||
public void next() {
|
||||
int initialDim = (restrictSumTo > 0)?1:0;
|
||||
hasNext = next(finalState, initialDim);
|
||||
if (hasNext)
|
||||
linearIndex++;
|
||||
}
|
||||
|
||||
private boolean next(final int[] finalState, int initialDim) {
|
||||
boolean hasNextState = false;
|
||||
for (int currentDim=initialDim; currentDim < finalState.length; currentDim++) {
|
||||
final int x = currentState[currentDim]+1;
|
||||
|
||||
if (x > finalState[currentDim] || (currentSum >= restrictSumTo && initialDim > 0)) {
|
||||
// update vector sum, and reset position
|
||||
currentSum -= currentState[currentDim];
|
||||
currentState[currentDim] = 0;
|
||||
if (currentDim >= dim-1) {
|
||||
hasNextState = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else {
|
||||
currentState[currentDim] = x;
|
||||
hasNextState = true;
|
||||
currentSum++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (initialDim > 0) {
|
||||
currentState[0] = restrictSumTo - currentSum;
|
||||
}
|
||||
return hasNextState;
|
||||
}
|
||||
|
||||
public void reset() {
|
||||
Arrays.fill(currentState, 0);
|
||||
if (restrictSumTo > 0)
|
||||
currentState[0] = restrictSumTo;
|
||||
hasNext = true;
|
||||
linearIndex = 0;
|
||||
currentSum = 0;
|
||||
}
|
||||
public int[] getCurrentVector() {
|
||||
return currentState;
|
||||
}
|
||||
|
||||
public int[] getCurrentAltVector() {
|
||||
return Arrays.copyOfRange(currentState,1,currentState.length);
|
||||
}
|
||||
/* public int getCurrentSum() {
|
||||
return currentSum;
|
||||
}
|
||||
*/
|
||||
public int getLinearIndex() {
|
||||
return linearIndex;
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
return hasNext;
|
||||
}
|
||||
}
|
||||
|
||||
public List<Allele> getAlleles() { return alleles;}
|
||||
|
||||
    /**
     * Returns an array of log10 likelihoods for each genotype conformation, with ordering determined by SumIterator class.
     *
     * Note: the internal array is returned directly (no defensive copy), so mutations by the
     * caller are visible to this object.
     *
     * @return likelihoods array
     */
    public double[] getLikelihoods() {
        return log10Likelihoods;
    }
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
    /**
     * Set particular element of logPL vector
     * @param idx index of allele count conformation to modify (SumIterator linear index)
     * @param pl  log10 likelihood to associate with that conformation
     */
    public void setLogPLs(final int idx, final double pl) {
        log10Likelihoods[idx] = pl;
    }
|
||||
|
||||
    /**
     * Re-normalizes the stored log10 likelihood vector in place (via MathUtils.normalizeFromLog10,
     * keeping the result in log10 space).
     */
    public void renormalize() {
        log10Likelihoods = MathUtils.normalizeFromLog10(log10Likelihoods,false,true);
    }
|
||||
    /** Compute most likely AC conformation based on currently stored PL's - just loop through log PL map and output max value
     *
     * @return pair of (most likely allele count vector ordered according to this object's alleles,
     *         its log10 likelihood); the vector is null if the likelihood array is empty
     */
    public Pair<int[],Double> getMostLikelyACCount() {

        int[] mlInd = null;
        double maxVal = Double.NEGATIVE_INFINITY;

        final SumIterator iterator = new SumIterator(alleles.size(),numChromosomes);

        // linear scan: iterator order matches the storage order of log10Likelihoods
        int idx = 0;
        while (iterator.hasNext()) {
            double pl = log10Likelihoods[idx++];
            if (pl > maxVal) {
                maxVal = pl;
                // clone: the iterator reuses its internal vector on every step
                mlInd = iterator.getCurrentVector().clone();

            }
            iterator.next();
        }
        if (VERBOSE) {
            System.out.println(VCFConstants.MLE_ALLELE_COUNT_KEY + ": " + Arrays.toString(mlInd));
        }
        return new Pair<int[], Double>(mlInd,maxVal);
    }
|
||||
|
||||
    /**
     * Given set of alleles with corresponding vector of likelihoods, subset to a new set of alleles
     *
     * @param oldLikelihoods  Vector of PL's corresponding to original alleles
     * @param numChromosomes  Ploidy (number of chromosomes describing PL's)
     * @param originalAlleles List of original alleles
     * @param allelesToSubset Alleles to subset
     * @return Vector of new PL's, ordered according to SumIterator's ordering
     */
    public static double[] subsetToAlleles(final double[] oldLikelihoods, final int numChromosomes,
                                           final List<Allele> originalAlleles, final List<Allele> allelesToSubset) {

        int newPLSize = PoolGenotypeLikelihoods.getNumLikelihoodElements(allelesToSubset.size(), numChromosomes);
        double[] newPLs = new double[newPLSize];


        int idx = 0;
        // First fill boolean array stating whether each original allele is present in new mapping
        final boolean [] allelePresent = new boolean[originalAlleles.size()];
        for ( Allele allele : originalAlleles )
            allelePresent[idx++] = allelesToSubset.contains(allele);


        // compute mapping from old idx to new idx
        // This might be needed in case new allele set is not ordered in the same way as old set
        // Example. Original alleles: {T*,C,G,A}. New alleles: {G,C}. Permutation key = [2,1]

        int[] permutationKey = new int[allelesToSubset.size()];
        for (int k=0; k < allelesToSubset.size(); k++)
            // for each allele to subset, find corresponding index in original allele list
            permutationKey[k] = originalAlleles.indexOf(allelesToSubset.get(k));


        if (VERBOSE) {
            System.out.println("permutationKey:"+Arrays.toString(permutationKey));
        }

        final SumIterator iterator = new SumIterator(originalAlleles.size(),numChromosomes);

        while (iterator.hasNext()) {
            // for each entry in logPL table, associated originally with allele count stored in vec[],
            // see if this allele count conformation will be present in new logPL table.
            // For entry to be present, elements in dimensions not present in requested allele list have to have count = 0
            int[] pVec = iterator.getCurrentVector();
            double pl = oldLikelihoods[iterator.getLinearIndex()];

            boolean keyPresent = true;
            for (int k=0; k < allelePresent.length; k++)
                if ( pVec[k]>0 && !allelePresent[k] )
                    keyPresent = false;

            if (keyPresent) {// skip to next entry in logPLs if this conformation is not present in subset

                final int[] newCount = new int[allelesToSubset.size()];

                // map from old allele mapping count to new allele mapping
                // In pseudo-Matlab notation: newCount = vec[permutationKey] for permutationKey vector
                for (idx = 0; idx < newCount.length; idx++)
                    newCount[idx] = pVec[permutationKey[idx]];

                // get corresponding index from new count
                int outputIdx = PoolGenotypeLikelihoods.getLinearIndex(newCount, allelesToSubset.size(), numChromosomes);
                newPLs[outputIdx] = pl;
                if (VERBOSE) {
                    System.out.println("Old Key:"+Arrays.toString(pVec));
                    System.out.println("New Key:"+Arrays.toString(newCount));
                }
            }
            iterator.next();
        }

        return newPLs;
    }
|
||||
|
||||
public static int getLinearIndex(int[] vectorIdx, int numAlleles, int ploidy) {
|
||||
|
||||
if (ploidy <= 0)
|
||||
return 0;
|
||||
|
||||
int linearIdx = 0;
|
||||
int cumSum = ploidy;
|
||||
for (int k=numAlleles-1;k>=1; k--) {
|
||||
int idx = vectorIdx[k];
|
||||
// how many blocks are before current position
|
||||
if (idx == 0)
|
||||
continue;
|
||||
for (int p=0; p < idx; p++)
|
||||
linearIdx += getNumLikelihoodElements( k, cumSum-p);
|
||||
|
||||
cumSum -= idx;
|
||||
}
|
||||
|
||||
return linearIdx;
|
||||
|
||||
}
|
||||
|
||||
    /**
     * Given a scalar index, what's the allele count conformation corresponding to it?
     * @param nAlleles       Number of alleles
     * @param numChromosomes Ploidy
     * @param PLindex        Index to query
     * @return Allele count conformation, according to iteration order from SumIterator;
     *         null if PLindex is out of range (never reached during iteration).
     *         Note: the returned array is the iterator's live internal vector.
     */
    public static int[] getAlleleCountFromPLIndex(final int nAlleles, final int numChromosomes, final int PLindex) {

        // todo - another brain-dead inefficient implementation, can do much better by computing in closed form
        // (linear scan through all conformations until the requested index is reached)
        final SumIterator iterator = new SumIterator(nAlleles,numChromosomes);
        while (iterator.hasNext()) {
            final int[] plVec = iterator.getCurrentVector();
            if (iterator.getLinearIndex() == PLindex)
                return plVec;

            iterator.next();
        }

        return null;

    }
|
||||
|
||||
    /*
     * a cache of the PL vector sizes as a function of # of alleles and pool sizes
     */

    /**
     * Looks up the number of genotype-likelihood elements for the given allele count and ploidy
     * from the precomputed cache.
     */
    public static int getNumLikelihoodElements(int numAlleles, int ploidy) {
        return GenotypeLikelihoodVectorSizes[numAlleles][ploidy];
    }
|
||||
|
||||
private final static int[][] GenotypeLikelihoodVectorSizes = fillGLVectorSizeCache(MAX_NUM_ALLELES_TO_CACHE, 2*MAX_NUM_SAMPLES_PER_POOL);
|
||||
|
||||
private static int[][] fillGLVectorSizeCache(int maxAlleles, int maxPloidy) {
|
||||
|
||||
int[][] cache = new int[maxAlleles][maxPloidy];
|
||||
for (int numAlleles=1; numAlleles < maxAlleles; numAlleles++) {
|
||||
for (int ploidy=0; ploidy < maxPloidy; ploidy++) {
|
||||
|
||||
if (numAlleles == 1)
|
||||
cache[numAlleles][ploidy] = 1;
|
||||
else if (ploidy == 1)
|
||||
cache[numAlleles][ploidy] = numAlleles;
|
||||
else {
|
||||
int acc =0;
|
||||
for (int k=0; k <= ploidy; k++ )
|
||||
acc += cache[numAlleles-1][ploidy-k];
|
||||
|
||||
cache[numAlleles][ploidy] = acc;
|
||||
}
|
||||
}
|
||||
}
|
||||
return cache;
|
||||
}
|
||||
|
||||
    /**
     * Return a string representation of this object in a moderately usable form
     *
     * Lists the alleles followed by every conformation whose likelihood is finite
     * (conformations at -infinity are omitted).
     *
     * @return string representation
     */
    public String toString() {
        StringBuilder s = new StringBuilder(1000);

        s.append("Alleles:");
        for (Allele a: this.alleles){
            s.append(a.getDisplayString());
            s.append(",");
        }
        s.append("\nGLs:\n");
        // iterate conformations in storage order and print only informative entries
        SumIterator iterator = new SumIterator(nAlleles,numChromosomes);
        while (iterator.hasNext()) {
            if (!Double.isInfinite(getLikelihoods()[iterator.getLinearIndex()])) {

                s.append("Count [");
                StringBuilder b = new StringBuilder(iterator.getCurrentVector().length*2);
                for (int it:iterator.getCurrentVector()) {
                    b.append(it);
                    b.append(",");
                }
                s.append(b.toString());
                s.append(String.format("] GL=%4.3f\n",this.getLikelihoods()[iterator.getLinearIndex()]) );
            }
            iterator.next();
        }
        return s.toString();
    }
|
||||
|
||||
|
||||
/**
 * Compute the log10 likelihood of every allele-count (AC) conformation for this pool,
 * writing the results into this object's PL vector (via setLogPLs) and renormalizing
 * at the end.
 *
 * When FAST_GL_COMPUTATION is on, uses a greedy queue-based exploration that stops
 * expanding conformations whose likelihood falls too far below the current maximum;
 * otherwise exhaustively iterates every conformation via SumIterator.
 *
 * @param errorModel      site-specific error model (may describe ref-sample based error rates)
 * @param alleleList      alleles under consideration
 * @param numObservations observation counts for each allele in alleleList
 * @param pileup          read pileup, passed through to the conformation likelihood calculation
 */
public void computeLikelihoods(ErrorModel errorModel,
                               List<Allele> alleleList, List<Integer> numObservations, ReadBackedPileup pileup) {

    if (FAST_GL_COMPUTATION) {
        // queue up elements to be computed. Assumptions:
        // GLs distributions are unimodal
        // GLs are continuous
        // Hence, once an AC conformation is computed, we queue up its immediate topological neighbors.
        // If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
        // and we repeat until queue is empty

        // queue of AC conformations to process
        final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>();

        // mapping of ExactACset indexes to the objects
        final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim);

        // add AC=0 to the queue: all chromosomes assigned to the reference allele (index 0)
        final int[] zeroCounts = new int[nAlleles];
        zeroCounts[0] = numChromosomes;

        AlleleFrequencyCalculationModel.ExactACset zeroSet =
                new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts));

        ACqueue.add(zeroSet);
        indexesToACset.put(zeroSet.ACcounts, zeroSet);

        // keep processing while we have AC conformations that need to be calculated
        double maxLog10L = Double.NEGATIVE_INFINITY;
        while ( !ACqueue.isEmpty() ) {
            // compute log10Likelihoods; the call also enqueues this set's promising neighbors
            final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove();
            final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);

            // adjust max likelihood seen if needed
            maxLog10L = Math.max(maxLog10L, log10LofKs);

            // clean up memory: set is fully processed, drop its dedup entry
            indexesToACset.remove(ACset.ACcounts);
            if ( VERBOSE )
                System.out.printf(" *** removing used set=%s%n", ACset.ACcounts);

        }
    } else {
        // exhaustive mode: visit every conformation in SumIterator's linear order
        int plIdx = 0;
        SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
        while (iterator.hasNext()) {
            AlleleFrequencyCalculationModel.ExactACset ACset =
                    new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector()));
            // for observed base X, add Q(jX,k) to likelihood vector for all k in error model
            //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
            getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);

            setLogPLs(plIdx++, ACset.log10Likelihoods[0]);
            iterator.next();
        }
    }
    // normalize PL's
    renormalize();

}
|
||||
|
||||
private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
final double maxLog10L,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts,
|
||||
AlleleFrequencyCalculationModel.ExactACset> indexesToACset,
|
||||
final ReadBackedPileup pileup) {
|
||||
// compute likelihood of set
|
||||
getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
|
||||
final double log10LofK = set.log10Likelihoods[0];
|
||||
|
||||
// log result in PL vector
|
||||
int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes);
|
||||
setLogPLs(idx, log10LofK);
|
||||
|
||||
// can we abort early because the log10Likelihoods are so small?
|
||||
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
|
||||
if ( VERBOSE )
|
||||
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
|
||||
return log10LofK;
|
||||
}
|
||||
|
||||
// iterate over higher frequencies if possible
|
||||
// by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
|
||||
final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0];
|
||||
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
|
||||
return log10LofK;
|
||||
|
||||
|
||||
// add conformations for other cases
|
||||
for ( int allele = 1; allele < nAlleles; allele++ ) {
|
||||
final int[] ACcountsClone = set.ACcounts.getCounts().clone();
|
||||
ACcountsClone[allele]++;
|
||||
// is this a valid conformation?
|
||||
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
|
||||
ACcountsClone[0] = numChromosomes - altSum;
|
||||
if (ACcountsClone[0] < 0)
|
||||
continue;
|
||||
|
||||
|
||||
updateACset(ACcountsClone, ACqueue, indexesToACset);
|
||||
}
|
||||
return log10LofK;
|
||||
|
||||
}
|
||||
|
||||
/**
 * Abstract methods, must be implemented in subclasses.
 *
 * Computes the likelihood of a single AC conformation and stores it in
 * {@code ACset.log10Likelihoods[0]}.
 *
 * @param ACset Count to compute
 * @param errorModel Site-specific error model object
 * @param alleleList List of alleles
 * @param numObservations Number of observations for each allele
 * @param pileup Read backed pileup in case it's necessary
 */
public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
                                                 final ErrorModel errorModel,
                                                 final List<Allele> alleleList,
                                                 final List<Integer> numObservations,
                                                 final ReadBackedPileup pileup);

/**
 * Accumulate the evidence in the given pileup into this likelihood object.
 *
 * @param pileup read pileup to process
 * @param UAC    unified argument collection driving the computation
 * @return the number of usable elements found in the pileup
 */
public abstract int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC);
|
||||
|
||||
// Static methods
|
||||
public static void updateACset(final int[] newSetCounts,
|
||||
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue,
|
||||
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) {
|
||||
|
||||
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts);
|
||||
if ( !indexesToACset.containsKey(index) ) {
|
||||
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index);
|
||||
indexesToACset.put(index, newSet);
|
||||
ACqueue.add(newSet);
|
||||
if (VERBOSE)
|
||||
System.out.println(" *** Adding set to queue:" + index.toString());
|
||||
}
|
||||
|
||||
}
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
//
|
||||
// helper routines
|
||||
//
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
||||
//
|
||||
// Constant static data
|
||||
//
|
||||
|
||||
// Precompute qualVec[q] = 10^(-q/10), the error probability implied by each Phred score.
static {
    // cache 10^(-k/10)
    for (int q = 0; q <= SAMUtils.MAX_PHRED_SCORE; q++)
        qualVec[q] = Math.pow(10.0, -(double) q / 10.0);
}
|
||||
|
||||
private void fillCache() {
|
||||
// cache Q(j,k) = log10(j/2N*(1-ek) + (2N-j)/2N*ek) for j = 0:2N
|
||||
|
||||
logMismatchProbabilityArray = new double[1+numChromosomes][1+SAMUtils.MAX_PHRED_SCORE];
|
||||
for (int i=0; i <= numChromosomes; i++) {
|
||||
for (int j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) {
|
||||
double phi = (double)i/numChromosomes;
|
||||
logMismatchProbabilityArray[i][j] = Math.log10(phi * (1.0-qualVec[j]) + qualVec[j]/3.0 * (1.0-phi));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,353 @@
|
|||
/*
|
||||
* Copyright (c) 2010, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Base class for pool-based genotype likelihood calculation models: computes per-pool
 * genotype likelihoods over allele-count conformations, optionally using a per-lane
 * error model derived from a designated reference sample.
 */
public abstract class PoolGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {

    //protected Set<String> laneIDs;

    /** Available calculation model flavors selectable from the command line. */
    public enum Model {
        SNP,
        INDEL,
        POOLSNP,
        POOLINDEL,
        BOTH
    }

    // command-line configuration shared by all pool calculation models
    final protected UnifiedArgumentCollection UAC;

    protected PoolGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC,logger);
        this.UAC = UAC;
    }

    /*
     Get vc with alleles from reference sample. Can be null if there's no ref sample call or no ref sample coverage at this site.
     */
    protected VariantContext getTrueAlleles(final RefMetaDataTracker tracker,
                                            final ReferenceContext ref,
                                            Map<String,AlignmentContext> contexts) {
        // Get reference base from VCF or Reference
        if (UAC.referenceSampleName == null)
            return null;

        AlignmentContext context = contexts.get(UAC.referenceSampleName);
        ArrayList<Allele> trueReferenceAlleles = new ArrayList<Allele>();

        VariantContext referenceSampleVC;

        if (tracker != null && context != null)
            referenceSampleVC = tracker.getFirstValue(UAC.referenceSampleRod, context.getLocation());
        else
            return null;

        if (referenceSampleVC == null) {
            // no ROD call at this site: synthesize a VC holding just the reference base
            trueReferenceAlleles.add(Allele.create(ref.getBase(),true));
            return new VariantContextBuilder("pc",ref.getLocus().getContig(), ref.getLocus().getStart(), ref.getLocus().getStop(),trueReferenceAlleles).make();
        }
        else {
            // copy the reference sample's called genotype (with its GQ) into a fresh VC
            Genotype referenceGenotype = referenceSampleVC.getGenotype(UAC.referenceSampleName);
            List<Allele> referenceAlleles = referenceGenotype.getAlleles();

            return new VariantContextBuilder("pc",referenceSampleVC.getChr(), referenceSampleVC.getStart(), referenceSampleVC.getEnd(),
                    referenceSampleVC.getAlleles())
                    .referenceBaseForIndel(referenceSampleVC.getReferenceBaseForIndel())
                    .genotypes(new GenotypeBuilder(UAC.referenceSampleName, referenceAlleles).GQ(referenceGenotype.getGQ()).make())
                    .make();
        }
    }

    /**
     * GATK Engine creates readgroups of the form XXX.Y.Z
     * XXX.Y is the unique lane identifier.
     * Z is the id of the sample to make the read group id unique
     * This function returns the list of lane identifiers.
     *
     * @param readGroups A collection of read group strings (obtained from the alignment context pileup)
     * @return a collection of lane ids.
     */
    public static Set<String> parseLaneIDs(Collection<String> readGroups) {
        HashSet<String> result = new HashSet<String>();
        for (String readGroup : readGroups) {
            result.add(getLaneIDFromReadGroupString(readGroup));
        }
        return result;
    }

    /**
     * GATK Engine creates readgroups of the form XXX.Y.Z
     * XXX.Y is the unique lane identifier.
     * Z is the id of the sample to make the read group id unique
     *
     * @param readGroupID the read group id string
     * @return just the lane id (the XXX.Y string)
     */
    public static String getLaneIDFromReadGroupString(String readGroupID) {
        // System.out.println(readGroupID);
        String [] parsedID = readGroupID.split("\\.");
        if (parsedID.length > 1)
            return parsedID[0] + "." + parsedID[1];
        else
            return parsedID[0] + ".0";
    }


    /** Wrapper class that encapsulates likelihood object and sample name
     *
     */
    protected static class PoolGenotypeData {

        public final String name;              // sample/pool name
        public final PoolGenotypeLikelihoods GL;  // computed likelihoods for this pool
        public final int depth;                // filtered pileup depth
        public final List<Allele> alleles;     // alleles the GLs were computed over

        public PoolGenotypeData(final String name, final PoolGenotypeLikelihoods GL, final int depth, final List<Allele> alleles) {
            this.name = name;
            this.GL = GL;
            this.depth = depth;
            this.alleles = alleles;
        }
    }

    // determines the alleles to use:
    // keeps any alternate allele whose most-likely-AC evidence (summed over pools)
    // exceeds the all-reference likelihood
    protected List<Allele> determineAlternateAlleles(final List<PoolGenotypeData> sampleDataList) {

        if (sampleDataList.isEmpty())
            return Collections.emptyList();

        final int REFERENCE_IDX = 0;
        final List<Allele> allAlleles = sampleDataList.get(0).GL.getAlleles();
        double[] likelihoodSums = new double[allAlleles.size()];

        // based on the GLs, find the alternate alleles with enough probability
        for ( PoolGenotypeData sampleData : sampleDataList ) {
            final Pair<int[],Double> mlACPair = sampleData.GL.getMostLikelyACCount();
            final double topLogGL = mlACPair.second;

            if (sampleData.GL.getAlleles().size() != allAlleles.size())
                throw new ReviewedStingException("BUG: inconsistent size of alleles!");

            // ref allele is always first in array list
            if (sampleData.GL.alleles.get(0).isNonReference())
                throw new ReviewedStingException("BUG: first allele in list is not reference!");

            double refGL = sampleData.GL.getLikelihoods()[REFERENCE_IDX];

            // check if maximum likelihood AC is all-ref for current pool. If so, skip
            if (mlACPair.first[REFERENCE_IDX] == sampleData.GL.numChromosomes)
                continue;

            // most likely AC is not all-ref: for all non-ref alleles, add difference of max likelihood and all-ref likelihood
            for (int i=0; i < mlACPair.first.length; i++) {
                if (i==REFERENCE_IDX) continue;

                if (mlACPair.first[i] > 0)
                    likelihoodSums[i] += topLogGL - refGL;

            }
        }

        final List<Allele> allelesToUse = new ArrayList<Allele>();
        for ( int i = 0; i < likelihoodSums.length; i++ ) {
            if ( likelihoodSums[i] > 0.0 )
                allelesToUse.add(allAlleles.get(i));
        }

        return allelesToUse;
    }


    /**
     * Compute per-pool genotype likelihoods at this site and assemble them into a
     * VariantContext with no-call genotypes carrying DP and PL attributes.
     * Returns null when no usable alleles or (when a reference sample is configured)
     * no error model can be built.
     */
    public VariantContext getLikelihoods(final RefMetaDataTracker tracker,
                                         final ReferenceContext ref,
                                         Map<String, AlignmentContext> contexts,
                                         final AlignmentContextUtils.ReadOrientation contextType,
                                         final List<Allele> allAllelesToUse,
                                         final boolean useBAQedPileup,
                                         final GenomeLocParser locParser) {

        HashMap<String, ErrorModel> perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts);
        if (perLaneErrorModels == null && UAC.referenceSampleName != null)
            return null;

        if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL) {
            // collapse all sample contexts into a single merged pool
            AlignmentContext mergedContext = AlignmentContextUtils.joinContexts(contexts.values());
            Map<String,AlignmentContext> newContext = new HashMap<String,AlignmentContext>();
            newContext.put(DUMMY_SAMPLE_NAME,mergedContext);
            contexts = newContext;
        }

        // get initial alleles to genotype
        final List<Allele> allAlleles = new ArrayList<Allele>();
        if (allAllelesToUse == null || allAllelesToUse.isEmpty())
            allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse));
        else
            allAlleles.addAll(allAllelesToUse);

        if (allAlleles.isEmpty())
            return null;

        final ArrayList<PoolGenotypeData> GLs = new ArrayList<PoolGenotypeData>(contexts.size());

        for ( Map.Entry<String, AlignmentContext> sample : contexts.entrySet() ) {
            // skip reference sample
            if (UAC.referenceSampleName != null && sample.getKey().equals(UAC.referenceSampleName))
                continue;

            ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();

            // create the GenotypeLikelihoods object
            final PoolGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO);
            // actually compute likelihoods
            final int nGoodBases = GL.add(pileup, UAC);
            if ( nGoodBases > 0 )
                // create wrapper object for likelihoods and add to list
                GLs.add(new PoolGenotypeData(sample.getKey(), GL, getFilteredDepth(pileup), allAlleles));
        }

        // find the alternate allele(s) that we should be using
        final List<Allele> alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs);

        // start making the VariantContext
        final GenomeLoc loc = ref.getLocus();
        final int endLoc = getEndLocation(tracker, ref, alleles);

        final VariantContextBuilder builder = new VariantContextBuilder("UG_call", loc.getContig(), loc.getStart(), endLoc, alleles);
        // NOTE(review): alleles were already passed to the builder constructor above;
        // this second call looks redundant — confirm before removing
        builder.alleles(alleles);

        final HashMap<String, Object> attributes = new HashMap<String, Object>();

        if (UAC.referenceSampleName != null && perLaneErrorModels != null)
            attributes.put(VCFConstants.REFSAMPLE_DEPTH_KEY, ErrorModel.getTotalReferenceDepth(perLaneErrorModels));

        builder.attributes(attributes);
        // create the genotypes; no-call everyone for now
        final GenotypesContext genotypes = GenotypesContext.create();
        final List<Allele> noCall = new ArrayList<Allele>();
        noCall.add(Allele.NO_CALL);

        for ( PoolGenotypeData sampleData : GLs ) {
            // extract from multidimensional array
            final double[] myLikelihoods = PoolGenotypeLikelihoods.subsetToAlleles(sampleData.GL.getLikelihoods(),sampleData.GL.numChromosomes,
                    allAlleles, alleles);

            // normalize in log space so that max element is zero.
            final GenotypeBuilder gb = new GenotypeBuilder(sampleData.name, noCall);
            gb.DP(sampleData.depth);
            gb.PL(MathUtils.normalizeFromLog10(myLikelihoods, false, true));
            genotypes.add(gb.make());
        }

        return builder.genotypes(genotypes).make();

    }


    /**
     * Build a per-lane error model from the reference sample's pileup at this site.
     * Returns null when no reference sample context is available.
     */
    protected HashMap<String, ErrorModel> getPerLaneErrorModels(final RefMetaDataTracker tracker,
                                                                final ReferenceContext ref,
                                                                Map<String, AlignmentContext> contexts) {
        VariantContext refVC = getTrueAlleles(tracker, ref, contexts);


        // Build error model for site based on reference sample, and keep stratified for each lane.
        AlignmentContext refContext = null;
        if (UAC.referenceSampleName != null)
            refContext = contexts.get(UAC.referenceSampleName);

        ReadBackedPileup refPileup = null;
        if (refContext != null) {
            HashMap<String, ErrorModel> perLaneErrorModels = new HashMap<String, ErrorModel>();
            refPileup = refContext.getBasePileup();

            Set<String> laneIDs = new TreeSet<String>();
            if (UAC.TREAT_ALL_READS_AS_SINGLE_POOL || UAC.IGNORE_LANE_INFO)
                laneIDs.add(DUMMY_LANE);
            else
                laneIDs = parseLaneIDs(refPileup.getReadGroups());
            // build per-lane error model for all lanes present in ref sample
            for (String laneID : laneIDs) {
                // get reference pileup for this lane
                ReadBackedPileup refLanePileup = refPileup;
                // subset for this lane
                if (refPileup != null && !(UAC.TREAT_ALL_READS_AS_SINGLE_POOL || UAC.IGNORE_LANE_INFO))
                    refLanePileup = refPileup.getPileupForLane(laneID);

                //ReferenceSample referenceSample = new ReferenceSample(UAC.referenceSampleName, refLanePileup, trueReferenceAlleles);
                perLaneErrorModels.put(laneID, new ErrorModel(UAC.minQualityScore, UAC.maxQualityScore, UAC.phredScaledPrior, refLanePileup, refVC, UAC.minPower));
            }
            return perLaneErrorModels;

        }
        else
            return null;

    }

    /*
     Abstract methods - must be implemented in derived classes
     */

    // factory for the concrete likelihood object (SNP vs indel flavor)
    protected abstract PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
                                                                              final double[] logLikelihoods,
                                                                              final int ploidy,
                                                                              final HashMap<String, ErrorModel> perLaneErrorModels,
                                                                              final boolean useBQAedPileup,
                                                                              final ReferenceContext ref,
                                                                              final boolean ignoreLaneInformation);

    // candidate alleles before any likelihood is computed
    protected abstract List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
                                                           final ReferenceContext ref,
                                                           Map<String, AlignmentContext> contexts,
                                                           final AlignmentContextUtils.ReadOrientation contextType,
                                                           final GenomeLocParser locParser,
                                                           final List<Allele> allAllelesToUse);

    // alleles retained after per-pool likelihoods are available
    protected abstract List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
                                                         final ReferenceContext ref,
                                                         final List<Allele> allAllelesToUse,
                                                         final ArrayList<PoolGenotypeData> GLs);

    // end coordinate of the emitted VariantContext (differs for indels)
    protected abstract int getEndLocation(final RefMetaDataTracker tracker,
                                          final ReferenceContext ref,
                                          final List<Allele> alternateAllelesToUse);
}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.HaplotypeIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
|
||||
public class PoolGenotypePriors implements GenotypePriors {
|
||||
private final double[] flatPriors;
|
||||
private final double heterozygosity;
|
||||
private final int samplesPerPool;
|
||||
private double[] priors = null;
|
||||
|
||||
/**
|
||||
* Create a new DiploidGenotypePriors object with flat priors for each diploid genotype
|
||||
*/
|
||||
public PoolGenotypePriors(double heterozygosity, int samplesPerPool) {
|
||||
flatPriors = new double[2*samplesPerPool+1];
|
||||
for (int k=0; k <flatPriors.length; k++)
|
||||
flatPriors[k] = Math.log10(heterozygosity);
|
||||
priors = flatPriors.clone();
|
||||
this.samplesPerPool = samplesPerPool;
|
||||
|
||||
this.heterozygosity = heterozygosity;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns an array of priors for each genotype, indexed by DiploidGenotype.ordinal values().
|
||||
*
|
||||
* @return log10 prior as a double array
|
||||
*/
|
||||
public double[] getPriors() {
|
||||
return priors;
|
||||
}
|
||||
|
||||
public double getHeterozygosity() { return heterozygosity; }
|
||||
public int getNSamplesPerPool() { return samplesPerPool; }
|
||||
|
||||
public boolean validate(boolean throwException) {
|
||||
try {
|
||||
|
||||
for (int i=0; i < priors.length; i++ ) {
|
||||
if ( ! MathUtils.wellFormedDouble(priors[i]) || ! MathUtils.isNegativeOrZero(priors[i]) ) {
|
||||
String bad = String.format("Prior %f is badly formed %b", priors[i], MathUtils.isNegativeOrZero(priors[i]));
|
||||
throw new IllegalStateException(String.format("At %d: %s", i, bad));
|
||||
}
|
||||
}
|
||||
} catch ( IllegalStateException e ) {
|
||||
if ( throwException )
|
||||
throw new RuntimeException(e);
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,217 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 5/18/12
|
||||
* Time: 10:06 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PoolIndelGenotypeLikelihoods extends PoolGenotypeLikelihoods {
|
||||
final PairHMMIndelErrorModel pairModel;
|
||||
final LinkedHashMap<Allele, Haplotype> haplotypeMap;
|
||||
final ReferenceContext refContext;
|
||||
final int eventLength;
|
||||
double[][] readHaplotypeLikelihoods;
|
||||
|
||||
public PoolIndelGenotypeLikelihoods(final List<Allele> alleles,
|
||||
final double[] logLikelihoods,
|
||||
final int ploidy,
|
||||
final HashMap<String, ErrorModel> perLaneErrorModels,
|
||||
final boolean ignoreLaneInformation,
|
||||
final PairHMMIndelErrorModel pairModel,
|
||||
final LinkedHashMap<Allele, Haplotype> haplotypeMap,
|
||||
final ReferenceContext referenceContext) {
|
||||
super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
|
||||
this.pairModel = pairModel;
|
||||
this.haplotypeMap = haplotypeMap;
|
||||
this.refContext = referenceContext;
|
||||
this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// add() routines. These are the workhorse routines for calculating the overall genotype
|
||||
// likelihoods given observed bases and reads. Includes high-level operators all the
|
||||
// way down to single base and qual functions.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect the additional observations contained within the
|
||||
* read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
|
||||
* pileup
|
||||
*
|
||||
* @param pileup read pileup
|
||||
* @param UAC the minimum base quality at which to consider a base valid
|
||||
* @return the number of good bases found in the pileup
|
||||
*/
|
||||
public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) {
|
||||
int n = 0;
|
||||
|
||||
if (!hasReferenceSampleData) {
|
||||
// no error models
|
||||
return add(pileup, (ErrorModel)null);
|
||||
}
|
||||
for (String laneID : perLaneErrorModels.keySet() ) {
|
||||
// get pileup for this lane
|
||||
ReadBackedPileup perLanePileup;
|
||||
if (ignoreLaneInformation)
|
||||
perLanePileup = pileup;
|
||||
else
|
||||
perLanePileup = pileup.getPileupForLane(laneID);
|
||||
|
||||
if (perLanePileup == null || perLanePileup.isEmpty())
|
||||
continue;
|
||||
|
||||
ErrorModel errorModel = perLaneErrorModels.get(laneID);
|
||||
n += add(perLanePileup, errorModel);
|
||||
if (ignoreLaneInformation)
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the pool's probability for all possible allele counts for all indel alleles observed.
|
||||
* Calculation is based on the error model
|
||||
* generated by the reference sample on the same lane. The probability is given by :
|
||||
*
|
||||
* Pr(ac = j1,j2,.. | pool, errorModel) = sum_over_all_Qs ( Pr(j1,j2,.. * Pr(errorModel_q) *
|
||||
* Pr(ac=j1,j2,..| pool, errorModel) = sum_over_all_Qs ( Pr(ac=j1,j2,..) * Pr(errorModel_q) *
|
||||
* [j1 * (1-eq)/2n + eq/3*(2*N-j1)
|
||||
* [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
|
||||
* jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT
|
||||
*
|
||||
* log Pr(ac=jA,jC,jG,jT| pool, errorModel) = logsum( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) *
|
||||
* [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
|
||||
* jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT)
|
||||
* = logsum(logPr(ac=jA,jC,jG,jT) + log(Pr(error_Model(q)
|
||||
* )) + nA*log(jA/2N(1-eq)+eq/3*(2N-jA)/2N) + nC*log(jC/2N(1-eq)+eq/3*(2N-jC)/2N)
|
||||
* + log(jG/2N(1-eq)+eq/3*(2N-jG)/2N) + log(jT/2N(1-eq)+eq/3*(2N-jT)/2N)
|
||||
*
|
||||
* Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3)
|
||||
*
|
||||
* Then logPr(ac=jA,jC,jG,jT|D,errorModel) = logPR(ac=Ja,jC,jG,jT) + logsum_k( logPr (errorModel[k],
|
||||
* nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
|
||||
*
|
||||
* If pileup data comes from several error models (because lanes can have different error models),
|
||||
* Pr(Ac=j|D,E1,E2) = sum(Pr(AC1=j1|D,E1,E2) * Pr(AC2=j-j2|D,E1,E2))
|
||||
* = sum(Pr(AC1=j1|D,E1)*Pr(AC2=j-j1|D,E2)) from j=0..2N
|
||||
*
|
||||
* So, for each lane, build error model and combine lanes.
|
||||
* To store model, can do
|
||||
* for jA=0:2N
|
||||
* for jC = 0:2N-jA
|
||||
* for jG = 0:2N-jA-jC
|
||||
* for jT = 0:2N-jA-jC-jG
|
||||
* Q(jA,jC,jG,jT)
|
||||
* for k = minSiteQual:maxSiteQual
|
||||
* likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
|
||||
*
|
||||
*
|
||||
*
|
||||
* where: nA,nC,nG,nT = counts of bases observed in pileup.
|
||||
*
|
||||
*
|
||||
* @param pileup Base pileup
|
||||
* @param errorModel Site error model
|
||||
* @return Number of bases added
|
||||
*/
|
||||
private int add(ReadBackedPileup pileup, ErrorModel errorModel) {
|
||||
int n=0;
|
||||
|
||||
// Number of alleless in pileup, in that order
|
||||
List<Integer> numSeenBases = new ArrayList<Integer>(this.alleles.size());
|
||||
|
||||
if (!hasReferenceSampleData) {
|
||||
final int numHaplotypes = haplotypeMap.size();
|
||||
|
||||
final int readCounts[] = new int[pileup.getNumberOfElements()];
|
||||
readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, PoolIndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts);
|
||||
n = readHaplotypeLikelihoods.length;
|
||||
} else {
|
||||
Allele refAllele = null;
|
||||
for (Allele a:alleles) {
|
||||
numSeenBases.add(0);
|
||||
if (a.isReference())
|
||||
refAllele = a;
|
||||
}
|
||||
|
||||
if (refAllele == null)
|
||||
throw new ReviewedStingException("BUG: no ref alleles in passed in allele list!");
|
||||
|
||||
// count number of elements in pileup
|
||||
for (PileupElement elt : pileup) {
|
||||
if (VERBOSE)
|
||||
System.out.format("base:%s isNextToDel:%b isNextToIns:%b eventBases:%s eventLength:%d\n",elt.getBase(), elt.isBeforeDeletionStart(),elt.isBeforeInsertion(),elt.getEventBases(),elt.getEventLength());
|
||||
int idx =0;
|
||||
for (Allele allele : alleles) {
|
||||
int cnt = numSeenBases.get(idx);
|
||||
numSeenBases.set(idx++,cnt + (ErrorModel.pileupElementMatches(elt, allele, refAllele)?1:0));
|
||||
}
|
||||
|
||||
n++;
|
||||
|
||||
}
|
||||
}
|
||||
computeLikelihoods(errorModel, alleles, numSeenBases, pileup);
|
||||
return n;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
 * Compute likelihood of current conformation
 *
 * When no reference-sample error model is available, combines the pair-HMM
 * read/haplotype likelihoods for each read with the allele-count mixture weights
 * (log10Cache[count] - LOG10_PLOIDY) and log10-sums across alleles.
 * Otherwise, accumulates per-quality mismatch likelihoods weighted by the
 * observation counts and dots them with the error-model probability vector.
 *
 * @param ACset Count to compute
 * @param errorModel Site-specific error model object
 * @param alleleList List of alleles
 * @param numObservations Number of observations for each allele in alleleList
 * @param pileup Read pileup (not used by this implementation; kept for the shared signature)
 */
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
                                        final ErrorModel errorModel,
                                        final List<Allele> alleleList,
                                        final List<Integer> numObservations,
                                        final ReadBackedPileup pileup) {
    final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size());
    double p1 = 0.0;

    if (!hasReferenceSampleData) {
        // no error model: use pair HMM likelihoods
        for (int i=0; i < readHaplotypeLikelihoods.length; i++) {
            // acc[k] = log10( P(read i | hap k) * count_k / ploidy )
            double acc[] = new double[alleleList.size()];
            for (int k=0; k < acc.length; k++ )
                acc[k] = readHaplotypeLikelihoods[i][k] + MathUtils.log10Cache[currentCnt[k]]-LOG10_PLOIDY;
            p1 += MathUtils.log10sumLog10(acc);
        }

    } else {
        final int minQ = errorModel.getMinSignificantQualityScore();
        final int maxQ = errorModel.getMaxSignificantQualityScore();
        // acVec[q-minQ] = sum over alleles of n_allele * log10 mismatch probability at quality q
        final double[] acVec = new double[maxQ - minQ + 1];

        for (int k=minQ; k<=maxQ; k++) {
            int idx=0;
            for (int n : numObservations)
                acVec[k-minQ] += n*logMismatchProbabilityArray[currentCnt[idx++]][k];
        }
        // Marginalize over the site quality distribution of the error model.
        p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec);
    }
    ACset.log10Likelihoods[0] = p1;
}
|
||||
}
|
||||
|
|
@ -0,0 +1,132 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class PoolIndelGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel {
|
||||
private static final int MAX_NUM_ALLELES_TO_GENOTYPE = 4;
|
||||
|
||||
private PairHMMIndelErrorModel pairModel;
|
||||
private boolean allelesArePadded = false;
|
||||
/*
|
||||
private static ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>> indelLikelihoodMap =
|
||||
new ThreadLocal<HashMap<PileupElement, LinkedHashMap<Allele, Double>>>() {
|
||||
protected synchronized HashMap<PileupElement, LinkedHashMap<Allele, Double>> initialValue() {
|
||||
return new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();
|
||||
}
|
||||
};
|
||||
*/
|
||||
|
||||
private LinkedHashMap<Allele, Haplotype> haplotypeMap;
|
||||
|
||||
/*
|
||||
static {
|
||||
indelLikelihoodMap.set(new HashMap<PileupElement, LinkedHashMap<Allele, Double>>());
|
||||
}
|
||||
*/
|
||||
|
||||
protected PoolIndelGenotypeLikelihoodsCalculationModel(final UnifiedArgumentCollection UAC, final Logger logger) {
|
||||
super(UAC, logger);
|
||||
|
||||
|
||||
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
|
||||
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION);
|
||||
haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
|
||||
}
|
||||
|
||||
|
||||
public static HashMap<PileupElement, LinkedHashMap<Allele, Double>> getIndelLikelihoodMap() {
|
||||
return IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap();
|
||||
}
|
||||
|
||||
|
||||
|
||||
protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
|
||||
final double[] logLikelihoods,
|
||||
final int ploidy,
|
||||
final HashMap<String, ErrorModel> perLaneErrorModels,
|
||||
final boolean useBQAedPileup,
|
||||
final ReferenceContext ref,
|
||||
final boolean ignoreLaneInformation){
|
||||
return new PoolIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref);
|
||||
}
|
||||
|
||||
protected List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final Map<String, AlignmentContext> contexts,
|
||||
final AlignmentContextUtils.ReadOrientation contextType,
|
||||
final GenomeLocParser locParser,
|
||||
final List<Allele> allAllelesToUse){
|
||||
|
||||
|
||||
final Pair<List<Allele>,Boolean> pair = IndelGenotypeLikelihoodsCalculationModel.getInitialAlleleList(tracker, ref, contexts, contextType, locParser, UAC,true);
|
||||
List<Allele> alleles = pair.first;
|
||||
|
||||
if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE)
|
||||
alleles = alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE);
|
||||
allelesArePadded = pair.second;
|
||||
if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
|
||||
IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().clear();
|
||||
haplotypeMap.clear();
|
||||
}
|
||||
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap);
|
||||
return alleles;
|
||||
|
||||
}
|
||||
|
||||
protected List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final List<Allele> allAllelesToUse,
|
||||
final ArrayList<PoolGenotypeData> GLs) {
|
||||
|
||||
// find the alternate allele(s) that we should be using
|
||||
final List<Allele> alleles = new ArrayList<Allele>();
|
||||
if ( allAllelesToUse != null )
|
||||
alleles.addAll(allAllelesToUse);
|
||||
else if (!GLs.isEmpty())
|
||||
alleles.addAll(GLs.get(0).alleles);
|
||||
return alleles;
|
||||
|
||||
}
|
||||
|
||||
protected int getEndLocation(final RefMetaDataTracker tracker,
|
||||
final ReferenceContext ref,
|
||||
final List<Allele> allelesToUse) {
|
||||
return IndelGenotypeLikelihoodsCalculationModel.computeEndLocation(allelesToUse, ref.getLocus(), allelesArePadded);
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,350 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import static java.lang.Math.log10;
|
||||
import static java.lang.Math.pow;
|
||||
|
||||
|
||||
/**
|
||||
* Stable, error checking version of the pool genotyper. Useful for calculating the likelihoods, priors,
|
||||
* and posteriors given a pile of bases and quality scores
|
||||
*
|
||||
*/
|
||||
public class PoolSNPGenotypeLikelihoods extends PoolGenotypeLikelihoods/* implements Cloneable*/ {
|
||||
|
||||
final List<Allele> myAlleles;
|
||||
final int[] alleleIndices;
|
||||
final boolean useBAQedPileup;
|
||||
final byte refByte;
|
||||
int mbq;
|
||||
//final double[] PofDGivenBase;
|
||||
|
||||
protected static final double[][][] qualLikelihoodCache;
|
||||
/**
|
||||
* Create a new GenotypeLikelhoods object with given priors and PCR error rate for each pool genotype
|
||||
* @param alleles Alleles associated with this likelihood object
|
||||
* @param logLikelihoods Likelihoods (can be null if no likelihoods known)
|
||||
* @param ploidy Ploidy of sample (# of chromosomes)
|
||||
* @param perLaneErrorModels error model objects for each lane
|
||||
* @param useBQAedPileup Use BAQed pileup
|
||||
* @param ignoreLaneInformation If true, lane info is ignored
|
||||
*/
|
||||
public PoolSNPGenotypeLikelihoods(final List<Allele> alleles, final double[] logLikelihoods, final int ploidy,
|
||||
final HashMap<String, ErrorModel> perLaneErrorModels, final boolean useBQAedPileup,final boolean ignoreLaneInformation) {
|
||||
super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
|
||||
this.useBAQedPileup = useBQAedPileup;
|
||||
|
||||
myAlleles = new ArrayList<Allele>(alleles);
|
||||
|
||||
refByte = alleles.get(0).getBases()[0]; // by construction, first allele in list is always ref!
|
||||
|
||||
if (myAlleles.size() < BaseUtils.BASES.length) {
|
||||
// likelihood only defined for subset of possible alleles. Fill then with other alleles to have all possible ones,
|
||||
for (byte b : BaseUtils.BASES) {
|
||||
// if base is not included in myAlleles, add new allele
|
||||
boolean isRef = (b==refByte);
|
||||
if (!myAlleles.contains(Allele.create(b,isRef)))
|
||||
myAlleles.add(Allele.create(b,isRef));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
// compute permutation vector to figure out mapping from indices to bases
|
||||
int idx = 0;
|
||||
alleleIndices = new int[myAlleles.size()];
|
||||
for (byte b : BaseUtils.BASES) {
|
||||
boolean isRef = (b==refByte);
|
||||
alleleIndices[idx++] = myAlleles.indexOf(Allele.create(b,isRef));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// -------------------------------------------------------------------------------------
|
||||
//
|
||||
// add() routines. These are the workhorse routines for calculating the overall genotype
|
||||
// likelihoods given observed bases and reads. Includes high-level operators all the
|
||||
// way down to single base and qual functions.
|
||||
//
|
||||
// -------------------------------------------------------------------------------------
|
||||
|
||||
public int add(ReadBackedPileup pileup, UnifiedArgumentCollection UAC) {
|
||||
mbq = UAC.MIN_BASE_QUALTY_SCORE; // record for later use
|
||||
return add(pileup, true, true, mbq);
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates likelihoods and posteriors to reflect the additional observations contained within the
|
||||
* read-based pileup up by calling add(observedBase, qualityScore) for each base / qual in the
|
||||
* pileup
|
||||
*
|
||||
* @param pileup read pileup
|
||||
* @param ignoreBadBases should we ignore bad bases?
|
||||
* @param capBaseQualsAtMappingQual should we cap a base's quality by its read's mapping quality?
|
||||
* @param minBaseQual the minimum base quality at which to consider a base valid
|
||||
* @return the number of good bases found in the pileup
|
||||
*/
|
||||
public int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
||||
int n = 0;
|
||||
|
||||
if ( useBAQedPileup )
|
||||
pileup = createBAQedPileup( pileup );
|
||||
|
||||
if (!hasReferenceSampleData) {
|
||||
return add(pileup, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual, null);
|
||||
}
|
||||
|
||||
for (String laneID : perLaneErrorModels.keySet() ) {
|
||||
// get pileup for this lane
|
||||
ReadBackedPileup perLanePileup;
|
||||
if (ignoreLaneInformation)
|
||||
perLanePileup = pileup;
|
||||
else
|
||||
perLanePileup = pileup.getPileupForLane(laneID);
|
||||
|
||||
if (perLanePileup == null || perLanePileup.isEmpty())
|
||||
continue;
|
||||
|
||||
ErrorModel errorModel = perLaneErrorModels.get(laneID);
|
||||
n += add(perLanePileup, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual, errorModel);
|
||||
if (ignoreLaneInformation)
|
||||
break;
|
||||
|
||||
}
|
||||
|
||||
return n;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculates the pool's probability for all possible allele counts for all bases. Calculation is based on the error model
|
||||
* generated by the reference sample on the same lane. The probability is given by :
|
||||
*
|
||||
* Pr(ac=jA,jC,jG,jT| pool, errorModel) = sum_over_all_Qs ( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) *
|
||||
* [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
|
||||
* jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT
|
||||
*
|
||||
* log Pr(ac=jA,jC,jG,jT| pool, errorModel) = logsum( Pr(ac=jA,jC,jG,jT) * Pr(errorModel_q) *
|
||||
* [jA*(1-eq)/2n + eq/3*(jc+jg+jt)/2N)^nA * jC*(1-eq)/2n + eq/3*(ja+jg+jt)/2N)^nC *
|
||||
* jG*(1-eq)/2n + eq/3*(jc+ja+jt)/2N)^nG * jT*(1-eq)/2n + eq/3*(jc+jg+ja)/2N)^nT)
|
||||
* = logsum(logPr(ac=jA,jC,jG,jT) + log(Pr(error_Model(q)
|
||||
* )) + nA*log(jA/2N(1-eq)+eq/3*(2N-jA)/2N) + nC*log(jC/2N(1-eq)+eq/3*(2N-jC)/2N)
|
||||
* + log(jG/2N(1-eq)+eq/3*(2N-jG)/2N) + log(jT/2N(1-eq)+eq/3*(2N-jT)/2N)
|
||||
*
|
||||
* Let Q(j,k) = log(j/2N*(1-e[k]) + (2N-j)/2N*e[k]/3)
|
||||
*
|
||||
* Then logPr(ac=jA,jC,jG,jT|D,errorModel) = logPR(ac=Ja,jC,jG,jT) + logsum_k( logPr (errorModel[k],
|
||||
* nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
|
||||
*
|
||||
* If pileup data comes from several error models (because lanes can have different error models),
|
||||
* Pr(Ac=j|D,E1,E2) = sum(Pr(AC1=j1|D,E1,E2) * Pr(AC2=j-j2|D,E1,E2))
|
||||
* = sum(Pr(AC1=j1|D,E1)*Pr(AC2=j-j1|D,E2)) from j=0..2N
|
||||
*
|
||||
* So, for each lane, build error model and combine lanes.
|
||||
* To store model, can do
|
||||
* for jA=0:2N
|
||||
* for jC = 0:2N-jA
|
||||
* for jG = 0:2N-jA-jC
|
||||
* for jT = 0:2N-jA-jC-jG
|
||||
* Q(jA,jC,jG,jT)
|
||||
* for k = minSiteQual:maxSiteQual
|
||||
* likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
|
||||
*
|
||||
*
|
||||
*
|
||||
* where: nA,nC,nG,nT = counts of bases observed in pileup.
|
||||
*
|
||||
*
|
||||
* @param pileup Base pileup
|
||||
* @param ignoreBadBases Whether to ignore bad bases
|
||||
* @param capBaseQualsAtMappingQual Cap base at mapping qual
|
||||
* @param minBaseQual Minimum base quality to consider
|
||||
* @param errorModel Site error model
|
||||
* @return Number of bases added
|
||||
*/
|
||||
private int add(ReadBackedPileup pileup, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual, ErrorModel errorModel) {
|
||||
// Number of [A C G T]'s in pileup, in that order
|
||||
List<Integer> numSeenBases = new ArrayList<Integer>(BaseUtils.BASES.length);
|
||||
for (byte b: BaseUtils.BASES)
|
||||
numSeenBases.add(0);
|
||||
|
||||
if (hasReferenceSampleData) {
|
||||
// count number of elements in pileup
|
||||
for (PileupElement elt : pileup) {
|
||||
byte obsBase = elt.getBase();
|
||||
byte qual = qualToUse(elt, ignoreBadBases, capBaseQualsAtMappingQual, minBaseQual);
|
||||
if ( qual == 0 )
|
||||
continue;
|
||||
|
||||
int idx = 0;
|
||||
|
||||
for (byte base:BaseUtils.BASES) {
|
||||
int cnt = numSeenBases.get(idx);
|
||||
numSeenBases.set(idx++,cnt + (base == obsBase?1:0));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
if (VERBOSE)
|
||||
System.out.format("numSeenBases: %d %d %d %d\n",numSeenBases.get(0),numSeenBases.get(1),numSeenBases.get(2),numSeenBases.get(3));
|
||||
}
|
||||
computeLikelihoods(errorModel, myAlleles, numSeenBases, pileup);
|
||||
return pileup.getNumberOfElements();
|
||||
}
|
||||
|
||||
/**
|
||||
* Compute likelihood of current conformation
|
||||
*
|
||||
* @param ACset Count to compute
|
||||
* @param errorModel Site-specific error model object
|
||||
* @param alleleList List of alleles
|
||||
* @param numObservations Number of observations for each allele in alleleList
|
||||
*/
|
||||
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset,
|
||||
final ErrorModel errorModel,
|
||||
final List<Allele> alleleList,
|
||||
final List<Integer> numObservations,
|
||||
final ReadBackedPileup pileup) {
|
||||
final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length);
|
||||
final int[] ac = new int[BaseUtils.BASES.length];
|
||||
|
||||
for (int k=0; k < BaseUtils.BASES.length; k++ )
|
||||
ac[k] = currentCnt[alleleIndices[k]];
|
||||
|
||||
double p1 = 0.0;
|
||||
|
||||
if (!hasReferenceSampleData) {
|
||||
// no error model: loop throught pileup to compute likalihoods just on base qualities
|
||||
for (final PileupElement elt : pileup) {
|
||||
final byte obsBase = elt.getBase();
|
||||
final byte qual = qualToUse(elt, true, true, mbq);
|
||||
if ( qual == 0 )
|
||||
continue;
|
||||
final double acc[] = new double[ACset.ACcounts.counts.length];
|
||||
for (int k=0; k < acc.length; k++ )
|
||||
acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]]
|
||||
- LOG10_PLOIDY;
|
||||
p1 += MathUtils.log10sumLog10(acc);
|
||||
}
|
||||
}
|
||||
else {
|
||||
final int minQ = errorModel.getMinSignificantQualityScore();
|
||||
final int maxQ = errorModel.getMaxSignificantQualityScore();
|
||||
final double[] acVec = new double[maxQ - minQ + 1];
|
||||
|
||||
final int nA = numObservations.get(0);
|
||||
final int nC = numObservations.get(1);
|
||||
final int nG = numObservations.get(2);
|
||||
final int nT = numObservations.get(3);
|
||||
|
||||
|
||||
for (int k=minQ; k<=maxQ; k++)
|
||||
acVec[k-minQ] = nA*logMismatchProbabilityArray[ac[0]][k] +
|
||||
nC*logMismatchProbabilityArray[ac[1]][k] +
|
||||
nG*logMismatchProbabilityArray[ac[2]][k] +
|
||||
nT*logMismatchProbabilityArray[ac[3]][k];
|
||||
|
||||
p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec);
|
||||
}
|
||||
ACset.log10Likelihoods[0] = p1;
|
||||
/* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1));
|
||||
System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ)));
|
||||
*/
|
||||
}
|
||||
|
||||
public ReadBackedPileup createBAQedPileup( final ReadBackedPileup pileup ) {
|
||||
final List<PileupElement> BAQedElements = new ArrayList<PileupElement>();
|
||||
for( final PileupElement PE : pileup ) {
|
||||
final PileupElement newPE = new BAQedPileupElement( PE );
|
||||
BAQedElements.add( newPE );
|
||||
}
|
||||
return new ReadBackedPileupImpl( pileup.getLocation(), BAQedElements );
|
||||
}
|
||||
|
||||
public class BAQedPileupElement extends PileupElement {
|
||||
public BAQedPileupElement( final PileupElement PE ) {
|
||||
super(PE.getRead(), PE.getOffset(), PE.isDeletion(), PE.isBeforeDeletedBase(), PE.isAfterDeletedBase(), PE.isBeforeInsertion(), PE.isAfterInsertion(), PE.isNextToSoftClip());
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte getQual( final int offset ) { return BAQ.calcBAQFromTag(getRead(), offset, true); }
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Helper function that returns the phred-scaled base quality score we should use for calculating
|
||||
* likelihoods for a pileup element. May return 0 to indicate that the observation is bad, and may
|
||||
* cap the quality score by the mapping quality of the read itself.
|
||||
*
|
||||
* @param p Pileup element
|
||||
* @param ignoreBadBases Flag to ignore bad bases
|
||||
* @param capBaseQualsAtMappingQual Whether to cap base Q at mapping quality
|
||||
* @param minBaseQual Min qual to use
|
||||
* @return New phred-scaled base quality
|
||||
*/
|
||||
private static byte qualToUse(PileupElement p, boolean ignoreBadBases, boolean capBaseQualsAtMappingQual, int minBaseQual) {
|
||||
if ( ignoreBadBases && !BaseUtils.isRegularBase( p.getBase() ) )
|
||||
return 0;
|
||||
|
||||
byte qual = p.getQual();
|
||||
|
||||
if ( qual > SAMUtils.MAX_PHRED_SCORE )
|
||||
throw new UserException.MalformedBAM(p.getRead(), String.format("the maximum allowed quality score is %d, but a quality of %d was observed in read %s. Perhaps your BAM incorrectly encodes the quality scores in Sanger format; see http://en.wikipedia.org/wiki/FASTQ_format for more details", SAMUtils.MAX_PHRED_SCORE, qual, p.getRead().getReadName()));
|
||||
if ( capBaseQualsAtMappingQual )
|
||||
qual = (byte)Math.min((int)qual, p.getMappingQual());
|
||||
if ( (int)qual < minBaseQual )
|
||||
qual = (byte)0;
|
||||
|
||||
return qual;
|
||||
}
|
||||
|
||||
static {
|
||||
qualLikelihoodCache = new double[BaseUtils.BASES.length][BaseUtils.BASES.length][1+SAMUtils.MAX_PHRED_SCORE];
|
||||
for (byte j=0; j <= SAMUtils.MAX_PHRED_SCORE; j++) {
|
||||
for (byte b1:BaseUtils.BASES) {
|
||||
for (byte b2:BaseUtils.BASES) {
|
||||
qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(b1)][BaseUtils.simpleBaseToBaseIndex(b2)][j] = log10PofObservingBaseGivenChromosome(b1,b2,j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param observedBase observed base
|
||||
* @param chromBase target base
|
||||
* @param qual base quality
|
||||
* @return log10 likelihood
|
||||
*/
|
||||
private static double log10PofObservingBaseGivenChromosome(byte observedBase, byte chromBase, byte qual) {
|
||||
final double log10_3 = log10(3.0);
|
||||
double logP;
|
||||
|
||||
if ( observedBase == chromBase ) {
|
||||
// the base is consistent with the chromosome -- it's 1 - e
|
||||
//logP = oneMinusData[qual];
|
||||
double e = pow(10, (qual / -10.0));
|
||||
logP = log10(1.0 - e);
|
||||
} else {
|
||||
// the base is inconsistent with the chromosome -- it's e * P(chromBase | observedBase is an error)
|
||||
logP = qual / -10.0 + (-log10_3);
|
||||
}
|
||||
|
||||
//System.out.printf("%c %c %d => %f%n", observedBase, chromBase, qual, logP);
|
||||
return logP;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,128 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Pool genotype likelihoods calculation model for SNPs: produces PoolSNPGenotypeLikelihoods
 * objects and selects the candidate allele lists for a site.
 */
public class PoolSNPGenotypeLikelihoodsCalculationModel extends PoolGenotypeLikelihoodsCalculationModel {


    protected PoolSNPGenotypeLikelihoodsCalculationModel( UnifiedArgumentCollection UAC, Logger logger) {
        super(UAC, logger);

    }

    // NOTE(review): the logLikelihoods, ploidy, and ignoreLaneInformation parameters are
    // ignored here in favor of null / UAC.samplePloidy / UAC.IGNORE_LANE_INFO, unlike the
    // indel model which passes its parameters through — confirm this is intentional.
    protected PoolGenotypeLikelihoods getPoolGenotypeLikelihoodObject(final List<Allele> alleles,
                                                                      final double[] logLikelihoods,
                                                                      final int ploidy,
                                                                      final HashMap<String, ErrorModel> perLaneErrorModels,
                                                                      final boolean useBQAedPileup,
                                                                      final ReferenceContext ref,
                                                                      final boolean ignoreLaneInformation) {
        return new PoolSNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO);
    }

    /**
     * Initial candidate alleles for a SNP site: the given list if supplied, otherwise the
     * reference base plus every other regular base as an alternate.
     */
    protected List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
                                                  final ReferenceContext ref,
                                                  Map<String, AlignmentContext> contexts,
                                                  final AlignmentContextUtils.ReadOrientation contextType,
                                                  final GenomeLocParser locParser,
                                                  final List<Allele> allAllelesToUse) {

        if (allAllelesToUse != null)
            return allAllelesToUse;


        final byte refBase = ref.getBase();
        final List<Allele> allAlleles = new ArrayList<Allele>();
        // first add ref allele
        allAlleles.add(Allele.create(refBase, true));
        // add all possible alt alleles
        for (byte b: BaseUtils.BASES) {
            if (refBase != b)
                allAlleles.add(Allele.create(b));
        }

        return allAlleles;
    }

    /**
     * Final allele list for emission: the supplied list, the GENOTYPE_GIVEN_ALLELES rod alleles
     * (or null when the rod has no SNP here), or ref plus the alternates determined from the GLs.
     */
    protected List<Allele> getFinalAllelesToUse(final RefMetaDataTracker tracker,
                                                final ReferenceContext ref,
                                                final List<Allele> allAllelesToUse,
                                                final ArrayList<PoolGenotypeData> GLs) {
        // find the alternate allele(s) that we should be using
        final List<Allele> alleles = new ArrayList<Allele>();
        if ( allAllelesToUse != null ) {
            alleles.addAll(allAllelesToUse);
        } else if ( UAC.GenotypingMode == GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
            final VariantContext vc = UnifiedGenotyperEngine.getVCFromAllelesRod(tracker, ref, ref.getLocus(), true, logger, UAC.alleles);

            // ignore places where we don't have a SNP
            if ( vc == null || !vc.isSNP() )
                return null;

            alleles.addAll(vc.getAlleles());
        } else {

            alleles.add(Allele.create(ref.getBase(),true));
            alleles.addAll(determineAlternateAlleles( GLs));

            // if there are no non-ref alleles...
            if ( alleles.size() == 1 ) {
                final int indexOfRefBase = BaseUtils.simpleBaseToBaseIndex(ref.getBase());
                // if we only want variants, then we don't need to calculate genotype likelihoods
                if ( UAC.OutputMode != UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY )
                    // otherwise, choose any alternate allele (it doesn't really matter)
                    alleles.add(Allele.create(BaseUtils.baseIndexToSimpleBase(indexOfRefBase == 0 ? 1 : 0)));
            }
        }
        return alleles;
    }

    /**
     * @param tracker dummy parameter here
     * @param ref Reference context
     * @param alternateAllelesToUse alt allele list
     * @return end location for vc to be created
     */
    protected int getEndLocation(final RefMetaDataTracker tracker,
                                 final ReferenceContext ref,
                                 final List<Allele> alternateAllelesToUse) {
        // for SNPs, end loc is the same as start loc
        return ref.getLocus().getStart();

    }


}
|
||||
|
|
@ -0,0 +1,159 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
 * A vector of log10 probabilities over a contiguous integer support range, optionally
 * compressed by trimming edge entries far below the maximum value.
 *
 * User: delangel
 * Date: 4/11/12
 * Time: 10:25 AM
 */
|
||||
public class ProbabilityVector {
|
||||
// Log10 probabilities for indices minVal..maxVal inclusive; entry i holds the value for index minVal + i.
private final double[] probabilityArray;
// Smallest index of the retained support range (inclusive).
private final int minVal;
// Largest index of the retained support range (inclusive).
private final int maxVal;

final static double LOG_DYNAMIC_RANGE = 10; // values X below max vector value will be removed
|
||||
|
||||
/**
|
||||
* Default constructor: take vector in log-space, with support from range [0,len-1]
|
||||
* @param vec Probability (or likelihood) vector in log space
|
||||
* @param compressRange If true, compress by eliminating edges with little support
|
||||
*/
|
||||
public ProbabilityVector(double[] vec, boolean compressRange) {
|
||||
|
||||
int maxValIdx = MathUtils.maxElementIndex(vec);
|
||||
double maxv = vec[maxValIdx];
|
||||
if (maxv > 0.0)
|
||||
throw new ReviewedStingException("BUG: Attempting to create a log-probability vector with positive elements");
|
||||
|
||||
if (compressRange) {
|
||||
minVal = getMinIdx(vec, maxValIdx);
|
||||
maxVal = getMaxIdx(vec, maxValIdx);
|
||||
probabilityArray = Arrays.copyOfRange(vec, minVal, maxVal+1);
|
||||
|
||||
} else {
|
||||
probabilityArray = vec;
|
||||
minVal = 0;
|
||||
maxVal = vec.length-1;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
/** Convenience constructor: compresses the support range by default. */
public ProbabilityVector(double[] vec) {
    this(vec,true);
}
|
||||
|
||||
/** Copy constructor: rebuilds from the other vector's uncompressed representation. */
public ProbabilityVector(ProbabilityVector other, boolean compressRange) {
    // create new probability vector from other.
    this(other.getUncompressedProbabilityVector(), compressRange);

}
|
||||
/** Smallest index of the support range (inclusive). */
public int getMinVal() { return minVal;}
/** Largest index of the support range (inclusive). */
public int getMaxVal() { return maxVal;}
/** Backing array of log10 probabilities, indexed from minVal. */
public double[] getProbabilityVector() { return probabilityArray;}
|
||||
|
||||
public double[] getProbabilityVector(int minVal, int maxVal) {
|
||||
// get vector in specified range. If range is outside of current vector, fill with negative infinities
|
||||
double[] x = new double[maxVal - minVal + 1];
|
||||
|
||||
for (int k=minVal; k <= maxVal; k++)
|
||||
x[k-minVal] = getLogProbabilityForIndex(k);
|
||||
|
||||
|
||||
return x;
|
||||
}
|
||||
|
||||
public double[] getUncompressedProbabilityVector() {
|
||||
double x[] = new double[maxVal+1];
|
||||
|
||||
for (int i=0; i < minVal; i++)
|
||||
x[i] = Double.NEGATIVE_INFINITY;
|
||||
for (int i=minVal; i <=maxVal; i++)
|
||||
x[i] = probabilityArray[i-minVal];
|
||||
|
||||
return x;
|
||||
}
|
||||
/**
|
||||
* Return log Probability for original index i
|
||||
* @param idx Index to probe
|
||||
* @return log10(Pr X = i) )
|
||||
*/
|
||||
public double getLogProbabilityForIndex(int idx) {
|
||||
if (idx < minVal || idx > maxVal)
|
||||
return Double.NEGATIVE_INFINITY;
|
||||
else
|
||||
return probabilityArray[idx-minVal];
|
||||
}
|
||||
|
||||
//public ProbabilityVector
|
||||
    /**
     * Static factory: build a range-compressed ProbabilityVector from a raw
     * log10-probability array.
     *
     * @param vec array of log10 probabilities (all elements must be <= 0)
     * @return a new vector with leading/trailing low-probability entries trimmed
     */
    public static ProbabilityVector compressVector(double[] vec ) {
        return new ProbabilityVector(vec, true);
    }
|
||||
|
||||
/**
|
||||
* Determine left-most index where a vector exceeds (max Value - DELTA)
|
||||
* @param vec Input vector
|
||||
* @param maxValIdx Index to stop - usually index with max value in vector
|
||||
* @return Min index where vector > vec[maxValIdx]-LOG_DYNAMIC_RANGE
|
||||
*/
|
||||
private static int getMinIdx(double[] vec, int maxValIdx) {
|
||||
int edgeIdx;
|
||||
for (edgeIdx=0; edgeIdx<=maxValIdx; edgeIdx++ ) {
|
||||
if (vec[edgeIdx] > vec[maxValIdx]-LOG_DYNAMIC_RANGE)
|
||||
break;
|
||||
}
|
||||
|
||||
return edgeIdx;
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine right-most index where a vector exceeds (max Value - DELTA)
|
||||
* @param vec Input vector
|
||||
* @param maxValIdx Index to stop - usually index with max value in vector
|
||||
* @return Max index where vector > vec[maxValIdx]-LOG_DYNAMIC_RANGE
|
||||
*/
|
||||
private static int getMaxIdx(double[] vec, int maxValIdx) {
|
||||
int edgeIdx;
|
||||
for (edgeIdx=vec.length-1; edgeIdx>=maxValIdx; edgeIdx-- ) {
|
||||
if (vec[edgeIdx] > vec[maxValIdx]-LOG_DYNAMIC_RANGE)
|
||||
break;
|
||||
}
|
||||
|
||||
return edgeIdx;
|
||||
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param other
|
||||
* @return
|
||||
*/
|
||||
public double logDotProduct(ProbabilityVector other) {
|
||||
// find overlap in range
|
||||
int minRange = Math.max(this.minVal, other.getMinVal());
|
||||
int maxRange = Math.min(this.maxVal, other.getMaxVal());
|
||||
if (minRange > maxRange)
|
||||
return Double.NEGATIVE_INFINITY;
|
||||
|
||||
// x = 0,1,2, y = 2,3,4. minRange = 2, maxRange = 2
|
||||
double[] result = new double[maxRange - minRange+1];
|
||||
for (int k=0; k <= maxRange-minRange; k++) {
|
||||
int startI = minRange - this.minVal;
|
||||
int startJ = minRange - other.getMinVal();
|
||||
result[k] = this.probabilityArray[k+startI] + other.probabilityArray[k+startJ];
|
||||
|
||||
|
||||
|
||||
}
|
||||
return MathUtils.approximateLog10SumLog10(result);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
|
||||
// simple edge class for connecting nodes in the graph
|
||||
public class DeBruijnEdge implements Comparable<DeBruijnEdge> {
|
||||
|
||||
private int multiplicity;
|
||||
private boolean isRef;
|
||||
|
||||
public DeBruijnEdge() {
|
||||
multiplicity = 1;
|
||||
isRef = false;
|
||||
}
|
||||
|
||||
public DeBruijnEdge( final boolean isRef ) {
|
||||
multiplicity = 1;
|
||||
this.isRef = isRef;
|
||||
}
|
||||
|
||||
public DeBruijnEdge( final boolean isRef, final int multiplicity ) {
|
||||
this.multiplicity = multiplicity;
|
||||
this.isRef = isRef;
|
||||
}
|
||||
|
||||
public int getMultiplicity() {
|
||||
return multiplicity;
|
||||
}
|
||||
|
||||
public void setMultiplicity( final int value ) {
|
||||
multiplicity = value;
|
||||
}
|
||||
|
||||
public boolean getIsRef() {
|
||||
return isRef;
|
||||
}
|
||||
|
||||
public void setIsRef( final boolean isRef ) {
|
||||
this.isRef = isRef;
|
||||
}
|
||||
|
||||
public boolean equals( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge ) {
|
||||
return (graph.getEdgeSource(this).equals(graph.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph.getEdgeTarget(edge)));
|
||||
}
|
||||
|
||||
public boolean equals( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge, final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph2 ) {
|
||||
return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo( final DeBruijnEdge that ) {
|
||||
return this.multiplicity - that.multiplicity;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
// simple node class for storing kmer sequences
|
||||
public class DeBruijnVertex {
|
||||
|
||||
protected final byte[] sequence;
|
||||
public final int kmer;
|
||||
|
||||
public DeBruijnVertex( final byte[] sequence, final int kmer ) {
|
||||
this.sequence = sequence;
|
||||
this.kmer = kmer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals( Object v ) {
|
||||
return v instanceof DeBruijnVertex && Arrays.equals(sequence, ((DeBruijnVertex) v).sequence);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() { // necessary to override here so that graph.containsVertex() works the same way as vertex.equals() as one might expect
|
||||
return Arrays.hashCode(sequence);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return new String(sequence);
|
||||
}
|
||||
|
||||
public String getSuffixString() {
|
||||
return new String( getSuffix() );
|
||||
}
|
||||
|
||||
public byte[] getSequence() {
|
||||
return sequence;
|
||||
}
|
||||
|
||||
public byte[] getSuffix() {
|
||||
return Arrays.copyOfRange( sequence, kmer - 1, sequence.length );
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,623 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import net.sf.samtools.Cigar;
|
||||
import net.sf.samtools.CigarElement;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFAlleleClipper;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class GenotypingEngine {
|
||||
|
||||
private final boolean DEBUG;
|
||||
private final int MNP_LOOK_AHEAD;
|
||||
private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
|
||||
private final static List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
|
||||
private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("<UNASSEMBLED_EVENT>", false);
|
||||
|
||||
public GenotypingEngine( final boolean DEBUG, final int MNP_LOOK_AHEAD, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
|
||||
this.DEBUG = DEBUG;
|
||||
this.MNP_LOOK_AHEAD = MNP_LOOK_AHEAD;
|
||||
this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE;
|
||||
noCall.add(Allele.NO_CALL);
|
||||
}
|
||||
|
||||
// This function is the streamlined approach, currently not being used
|
||||
    /**
     * Streamlined genotyping path: treats whole haplotypes as alleles, runs the
     * exact model once over all of them, prunes haplotypes the model rejected,
     * and then decomposes the survivors into per-position events for output.
     * Mutates {@code haplotypes} by removing rejected entries.
     *
     * @param UG_engine          UnifiedGenotyper engine used for the exact model calculation
     * @param haplotypes         candidate haplotypes (modified in place; all assumed to share the same sample set)
     * @param ref                reference bases for the region
     * @param refLoc             genomic span of {@code ref}
     * @param activeRegionWindow window within which events are emitted
     * @param genomeLocParser    parser used by the VC merge
     * @return one (VariantContext, allele-to-haplotypes map) pair per emitted event;
     *         empty when the call confidence is below threshold
     */
    @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
    public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList<Haplotype> haplotypes, final byte[] ref, final GenomeLoc refLoc,
                                                                                                                            final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser ) {
        // Prepare the list of haplotype indices to genotype
        final ArrayList<Allele> allelesToGenotype = new ArrayList<Allele>();

        for( final Haplotype h : haplotypes ) {
            allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) );
        }
        final int numHaplotypes = haplotypes.size();

        // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
        final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
        for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
            // flattened lower-triangular diploid likelihood array: size n*(n+1)/2
            final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
            final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample);
            int glIndex = 0;
            for( int iii = 0; iii < numHaplotypes; iii++ ) {
                for( int jjj = 0; jjj <= iii; jjj++ ) {
                    genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
                }
            }
            genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make());
        }
        final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
        if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here

        // Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF
        final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
        for( final Haplotype h : haplotypes ) {
            if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF
                haplotypesToRemove.add(h);
            }
        }
        haplotypes.removeAll(haplotypesToRemove);

        if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) {
            // emit the haplotype-level call directly with a 1-to-1 allele mapping
            final List<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>> returnVCs = new ArrayList<Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>>();
            // set up the default 1-to-1 haplotype mapping object
            final HashMap<Allele,ArrayList<Haplotype>> haplotypeMapping = new HashMap<Allele,ArrayList<Haplotype>>();
            for( final Haplotype h : haplotypes ) {
                final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
                list.add(h);
                haplotypeMapping.put(call.getAllele(h.getBases()), list);
            }
            returnVCs.add( new Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>>(call,haplotypeMapping) );
            return returnVCs;
        }

        final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();

        // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
        final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
        int count = 0;
        if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
        for( final Haplotype h : haplotypes ) {
            if( DEBUG ) {
                System.out.println( h.toString() );
                System.out.println( "> Cigar = " + h.getCigar() );
            }
            // Walk along the alignment and turn any difference from the reference into an event
            h.setEventMap( generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) );
            startPosKeySet.addAll(h.getEventMap().keySet());
        }

        // Create the VC merge priority list ("HC0", "HC1", ... matching the event source names above)
        final ArrayList<String> priorityList = new ArrayList<String>();
        for( int iii = 0; iii < haplotypes.size(); iii++ ) {
            priorityList.add("HC" + iii);
        }

        // Walk along each position in the key set and create each event to be outputted
        for( final int loc : startPosKeySet ) {
            if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
                // collect the distinct allele sets present at this position across haplotypes
                final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
                for( final Haplotype h : haplotypes ) {
                    final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
                    final VariantContext vc = eventMap.get(loc);
                    if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
                        eventsAtThisLoc.add(vc);
                    }
                }

                // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
                final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );

                // Merge the event to find a common reference representation
                final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);

                // relies on mergedVC.getAlleles() ordering matching alleleMapper (ref first, then events)
                final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
                int aCount = 0;
                for( final Allele a : mergedVC.getAlleles() ) {
                    alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
                }

                if( DEBUG ) {
                    System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
                    //System.out.println("Event/haplotype allele mapping = " + alleleMapper);
                }

                // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
                final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
                for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
                    final int myNumHaplotypes = alleleMapper.size();
                    final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2];
                    final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
                    int glIndex = 0;
                    for( int iii = 0; iii < myNumHaplotypes; iii++ ) {
                        for( int jjj = 0; jjj <= iii; jjj++ ) {
                            genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
                        }
                    }

                    // using the allele mapping object translate the haplotype allele into the event allele
                    // first event in the region is emitted unphased; later ones are marked phased
                    final Genotype g = new GenotypeBuilder(sample)
                            .alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes))
                            .phased(loc != startPosKeySet.first())
                            .PL(genotypeLikelihoods).make();
                    myGenotypes.add(g);
                }
                returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(
                        new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) );
            }
        }
        return returnCalls;
    }
|
||||
|
||||
    /**
     * Main genotyping path: decomposes every haplotype into per-position events,
     * optionally merges consecutive events in high LD into complex substitutions,
     * and runs the exact model independently per event position. Supports GGA
     * (genotype-given-alleles) mode via {@code activeAllelesToGenotype}.
     * Mutates the event maps attached to {@code haplotypes} and may reuse/clear
     * the internally-built priority list in GGA mode.
     *
     * @param UG_engine               UnifiedGenotyper engine used for the exact model
     * @param haplotypes              candidate haplotypes (assumed to share the same sample set)
     * @param ref                     reference bases for the region
     * @param refLoc                  genomic span of {@code ref}
     * @param activeRegionWindow      window within which events are emitted
     * @param genomeLocParser         parser used by the VC merge
     * @param activeAllelesToGenotype when non-empty, restricts genotyping to these alleles (GGA mode)
     * @return one (VariantContext, allele-to-haplotypes map) pair per confidently called event
     */
    @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
    public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, final ArrayList<Haplotype> haplotypes, final byte[] ref, final GenomeLoc refLoc,
                                                                                                                              final GenomeLoc activeRegionWindow, final GenomeLocParser genomeLocParser, final ArrayList<VariantContext> activeAllelesToGenotype ) {

        final ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> returnCalls = new ArrayList<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>>();

        // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file
        final TreeSet<Integer> startPosKeySet = new TreeSet<Integer>();
        int count = 0;
        if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); }
        for( final Haplotype h : haplotypes ) {
            // Walk along the alignment and turn any difference from the reference into an event
            h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++, MNP_LOOK_AHEAD ) );
            // in GGA mode the positions come from activeAllelesToGenotype instead (added below)
            if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); }
            if( DEBUG ) {
                System.out.println( h.toString() );
                System.out.println( "> Cigar = " + h.getCigar() );
                System.out.println( "> Left and right breaks = (" + h.leftBreakPoint + " , " + h.rightBreakPoint + ")");
                System.out.println( ">> Events = " + h.getEventMap());
            }
        }
        // Create the VC merge priority list ("HC0", "HC1", ... matching the event source names above)
        final ArrayList<String> priorityList = new ArrayList<String>();
        for( int iii = 0; iii < haplotypes.size(); iii++ ) {
            priorityList.add("HC" + iii);
        }

        cleanUpSymbolicUnassembledEvents( haplotypes, priorityList );
        if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure
            mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
        }
        if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode!
            for( final VariantContext compVC : activeAllelesToGenotype ) {
                startPosKeySet.add( compVC.getStart() );
            }
        }

        // Walk along each position in the key set and create each event to be outputted
        for( final int loc : startPosKeySet ) {
            if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
                final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>();
                if( activeAllelesToGenotype.isEmpty() ) {
                    // collect the distinct allele sets present at this position across haplotypes
                    for( final Haplotype h : haplotypes ) {
                        final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
                        final VariantContext vc = eventMap.get(loc);
                        if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
                            eventsAtThisLoc.add(vc);
                        }
                    }
                } else { // we are in GGA mode!
                    for( final VariantContext compVC : activeAllelesToGenotype ) {
                        if( compVC.getStart() == loc ) {
                            // decompose the multi-allelic comp VC into one biallelic VC per alt allele,
                            // rebuilding the priority list to match the new source names
                            priorityList.clear();
                            int alleleCount = 0;
                            for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
                                HashSet<Allele> alleleSet = new HashSet<Allele>(2);
                                alleleSet.add(compVC.getReference());
                                alleleSet.add(compAltAllele);
                                priorityList.add("Allele" + alleleCount);
                                eventsAtThisLoc.add(new VariantContextBuilder(compVC).alleles(alleleSet).source("Allele"+alleleCount).make());
                                alleleCount++;
                            }
                        }
                    }
                }

                if( eventsAtThisLoc.isEmpty() ) { continue; }

                // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
                final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );

                // Merge the event to find a common reference representation
                final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
                if( mergedVC == null ) { continue; }

                // relies on mergedVC.getAlleles() ordering matching alleleMapper (ref first, then events)
                final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
                int aCount = 0;
                for( final Allele a : mergedVC.getAlleles() ) {
                    alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
                }

                if( DEBUG ) {
                    System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles());
                    //System.out.println("Event/haplotype allele mapping = " + alleleMapper);
                }

                // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample
                final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size());
                for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples
                    final int numHaplotypes = alleleMapper.size();
                    // flattened lower-triangular diploid likelihood array: size n*(n+1)/2
                    final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2];
                    final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper);
                    int glIndex = 0;
                    for( int iii = 0; iii < numHaplotypes; iii++ ) {
                        for( int jjj = 0; jjj <= iii; jjj++ ) {
                            genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC
                        }
                    }
                    genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
                }
                final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);

                if( call != null ) {
                    returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
                }
            }
        }
        return returnCalls;
    }
|
||||
|
||||
    /**
     * Remove haplotypes that carry a symbolic event at a position where some
     * other haplotype carries a concrete indel, and drop the matching source
     * names from the merge priority list. Both argument lists are modified in place.
     *
     * @param haplotypes   candidate haplotypes (entries may be removed)
     * @param priorityList merge priority names (entries may be removed)
     */
    protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes, final ArrayList<String> priorityList ) {
        final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
        final ArrayList<String> stringsToRemove = new ArrayList<String>();
        for( final Haplotype h : haplotypes ) {
            for( final VariantContext vc : h.getEventMap().values() ) {
                if( vc.isSymbolic() ) {
                    for( final Haplotype h2 : haplotypes ) {
                        for( final VariantContext vc2 : h2.getEventMap().values() ) {
                            if( vc.getStart() == vc2.getStart() && vc2.isIndel() ) {
                                haplotypesToRemove.add(h);
                                stringsToRemove.add(vc.getSource());
                                // NOTE: this break only exits the innermost vc2 loop, so h may be
                                // added to the removal lists more than once -- harmless for removeAll below
                                break;
                            }
                        }
                    }
                }
            }
        }
        haplotypes.removeAll(haplotypesToRemove);
        priorityList.removeAll(stringsToRemove);
    }
|
||||
|
||||
    /**
     * Iteratively merge pairs of nearby biallelic events that are in near-perfect
     * LD (R^2 above threshold) into single complex substitution events. Restarts
     * the scan after every successful merge until a fixed point is reached.
     * Mutates both the per-haplotype event maps and {@code startPosKeySet}.
     *
     * @param haplotypes     candidate haplotypes whose event maps are rewritten in place
     * @param startPosKeySet sorted start positions of all known events (updated in place)
     * @param ref            reference bases for the region
     * @param refLoc         genomic span of {@code ref}
     */
    protected void mergeConsecutiveEventsBasedOnLD( final ArrayList<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {
        final int MAX_SIZE_TO_COMBINE = 10;         // only consider event pairs starting within this many bases
        final double MERGE_EVENTS_R2_THRESHOLD = 0.95;
        if( startPosKeySet.size() <= 1 ) { return; }

        boolean mapWasUpdated = true;
        while( mapWasUpdated ) {
            mapWasUpdated = false;

            // loop over the set of start locations and consider pairs that start near each other
            final Iterator<Integer> iter = startPosKeySet.iterator();
            int thisStart = iter.next();
            while( iter.hasNext() ) {
                final int nextStart = iter.next();
                if( nextStart - thisStart < MAX_SIZE_TO_COMBINE) {
                    boolean isBiallelic = true;
                    VariantContext thisVC = null;
                    VariantContext nextVC = null;
                    // 2x2 contingency table of event co-occurrence across haplotypes
                    int x11 = 0;
                    int x12 = 0;
                    int x21 = 0;
                    int x22 = 0;

                    for( final Haplotype h : haplotypes ) {
                        // only make complex substitutions out of consecutive biallelic sites
                        final VariantContext thisHapVC = h.getEventMap().get(thisStart);
                        if( thisHapVC != null && !thisHapVC.isSymbolic() ) { // something was found at this location on this haplotype
                            if( thisVC == null ) {
                                thisVC = thisHapVC;
                            } else if( !thisHapVC.hasSameAllelesAs( thisVC ) ) {
                                isBiallelic = false;
                                break;
                            }
                        }
                        final VariantContext nextHapVC = h.getEventMap().get(nextStart);
                        if( nextHapVC != null && !nextHapVC.isSymbolic() ) { // something was found at the next location on this haplotype
                            if( nextVC == null ) {
                                nextVC = nextHapVC;
                            } else if( !nextHapVC.hasSameAllelesAs( nextVC ) ) {
                                isBiallelic = false;
                                break;
                            }
                        }
                        // count up the co-occurrences of the events for the R^2 calculation
                        // BUGBUG: use haplotype likelihoods per-sample to make this more accurate
                        if( thisHapVC == null ) {
                            if( nextHapVC == null ) { x11++; }
                            else { x12++; }
                        } else {
                            if( nextHapVC == null ) { x21++; }
                            else { x22++; }
                        }
                    }
                    if( thisVC == null || nextVC == null ) {
                        // NOTE(review): this 'continue' skips the 'thisStart = nextStart' update at the
                        // bottom of the loop, so the following pair is compared against the older
                        // thisStart -- confirm this is intended
                        continue;
                        //throw new ReviewedStingException("StartPos TreeSet has an entry for an event that is found on no haplotype. start pos = " + thisStart + ", next pos = " + nextStart);
                    }
                    if( isBiallelic ) {
                        final double R2 = calculateR2LD( x11, x12, x21, x22 );
                        if( DEBUG ) {
                            System.out.println("Found consecutive biallelic events with R^2 = " + String.format("%.4f", R2));
                            System.out.println("-- " + thisVC);
                            System.out.println("-- " + nextVC);
                        }
                        if( R2 > MERGE_EVENTS_R2_THRESHOLD ) {

                            final VariantContext mergedVC = createMergedVariantContext(thisVC, nextVC, ref, refLoc);

                            // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event
                            for( final Haplotype h : haplotypes ) {
                                final HashMap<Integer, VariantContext> eventMap = h.getEventMap();
                                if( eventMap.containsKey(thisStart) && eventMap.containsKey(nextStart) ) {
                                    eventMap.remove(thisStart);
                                    eventMap.remove(nextStart);
                                    eventMap.put(mergedVC.getStart(), mergedVC);
                                }
                            }
                            startPosKeySet.add(mergedVC.getStart());
                            // only drop the old start positions once no haplotype still refers to them
                            boolean containsStart = false;
                            boolean containsNext = false;
                            for( final Haplotype h : haplotypes ) {
                                final HashMap<Integer, VariantContext> eventMap = h.getEventMap();
                                if( eventMap.containsKey(thisStart) ) { containsStart = true; }
                                if( eventMap.containsKey(nextStart) ) { containsNext = true; }
                            }
                            if(!containsStart) { startPosKeySet.remove(thisStart); }
                            if(!containsNext) { startPosKeySet.remove(nextStart); }

                            if( DEBUG ) { System.out.println("====> " + mergedVC); }
                            mapWasUpdated = true;
                            break; // break out of tree set iteration since it was just updated, start over from the beginning and keep merging events
                        }
                    }
                }
                thisStart = nextStart;
            }
        }
    }
|
||||
|
||||
    // BUGBUG: make this merge function more general
    /**
     * Splice two consecutive events into one combined variant context by
     * concatenating: (optional indel padding base) + first event's alleles +
     * intervening reference bases + (optional padding base) + second event's
     * alleles. Only the first alternate allele of each event is used.
     *
     * @param thisVC the left (earlier) event
     * @param nextVC the right (later) event
     * @param ref    reference bases for the region
     * @param refLoc genomic span of {@code ref}
     * @return a new biallelic "merged" VariantContext spanning both events
     */
    protected static VariantContext createMergedVariantContext( final VariantContext thisVC, final VariantContext nextVC, final byte[] ref, final GenomeLoc refLoc ) {
        final int thisStart = thisVC.getStart();
        final int nextStart = nextVC.getStart();
        byte[] refBases = ( thisVC.hasReferenceBaseForIndel() ? new byte[]{ thisVC.getReferenceBaseForIndel() } : new byte[]{} );
        byte[] altBases = ( thisVC.hasReferenceBaseForIndel() ? new byte[]{ thisVC.getReferenceBaseForIndel() } : new byte[]{} );
        refBases = ArrayUtils.addAll(refBases, thisVC.getReference().getBases());
        altBases = ArrayUtils.addAll(altBases, thisVC.getAlternateAllele(0).getBases());
        // fill the gap between the two events with reference bases (identical on both alleles)
        for( int locus = thisStart + refBases.length; locus < nextStart; locus++ ) {
            final byte refByte = ref[locus - refLoc.getStart()];
            refBases = ArrayUtils.add(refBases, refByte);
            altBases = ArrayUtils.add(altBases, refByte);
        }
        if( nextVC.hasReferenceBaseForIndel() ) {
            refBases = ArrayUtils.add(refBases, nextVC.getReferenceBaseForIndel());
            altBases = ArrayUtils.add(altBases, nextVC.getReferenceBaseForIndel());
        }
        refBases = ArrayUtils.addAll(refBases, nextVC.getReference().getBases());
        altBases = ArrayUtils.addAll(altBases, nextVC.getAlternateAllele(0).getBases());

        int iii = 0;
        if( refBases.length == altBases.length && VCFAlleleClipper.needsPadding(thisVC) ) { // special case of insertion + deletion of same length creates an MNP --> trim padding bases off the allele
            while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; }
        }
        final ArrayList<Allele> mergedAlleles = new ArrayList<Allele>();
        mergedAlleles.add( Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true ) );
        mergedAlleles.add( Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false ) );
        return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), mergedAlleles).make();
    }
|
||||
|
||||
@Requires({"x11 >= 0", "x12 >= 0", "x21 >= 0", "x22 >= 0"})
|
||||
protected static double calculateR2LD( final int x11, final int x12, final int x21, final int x22 ) {
|
||||
final int total = x11 + x12 + x21 + x22;
|
||||
final double pa1b1 = ((double) x11) / ((double) total);
|
||||
final double pa1b2 = ((double) x12) / ((double) total);
|
||||
final double pa2b1 = ((double) x21) / ((double) total);
|
||||
final double pa1 = pa1b1 + pa1b2;
|
||||
final double pb1 = pa1b1 + pa2b1;
|
||||
return ((pa1b1 - pa1*pb1) * (pa1b1 - pa1*pb1)) / ( pa1 * (1.0 - pa1) * pb1 * (1.0 - pb1) );
|
||||
}
|
||||
|
||||
    /**
     * Group haplotypes by which event allele they support at a position. The
     * returned list is parallel to (ref allele, eventsAtThisLoc...): entry 0
     * holds the reference-supporting haplotypes (including, in GGA mode, those
     * whose event is not among the genotype-able options), and entry i+1 holds
     * the haplotypes matching eventsAtThisLoc[i].
     *
     * @param loc             event start position to examine
     * @param eventsAtThisLoc distinct events observed/requested at {@code loc}
     * @param haplotypes      candidate haplotypes
     * @return per-allele haplotype groups, reference group first
     */
    @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"})
    @Ensures({"result.size() == eventsAtThisLoc.size() + 1"})
    protected static ArrayList<ArrayList<Haplotype>> createAlleleMapper( final int loc, final ArrayList<VariantContext> eventsAtThisLoc, final ArrayList<Haplotype> haplotypes ) {
        final ArrayList<ArrayList<Haplotype>> alleleMapper = new ArrayList<ArrayList<Haplotype>>();
        final ArrayList<Haplotype> refList = new ArrayList<Haplotype>();
        for( final Haplotype h : haplotypes ) {
            if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype
                refList.add(h);
            } else {
                boolean foundInEventList = false;
                for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
                    if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
                        foundInEventList = true;
                    }
                }
                if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype
                    refList.add(h);
                }
            }
        }
        alleleMapper.add(refList);
        // one group per event, in eventsAtThisLoc order
        for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) {
            final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
            for( final Haplotype h : haplotypes ) {
                if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) {
                    list.add(h);
                }
            }
            alleleMapper.add(list);
        }
        return alleleMapper;
    }
|
||||
|
||||
@Ensures({"result.size() == haplotypeAllelesForSample.size()"})
|
||||
protected static List<Allele> findEventAllelesInSample( final List<Allele> eventAlleles, final List<Allele> haplotypeAlleles, final List<Allele> haplotypeAllelesForSample, final ArrayList<ArrayList<Haplotype>> alleleMapper, final ArrayList<Haplotype> haplotypes ) {
|
||||
if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; }
|
||||
final ArrayList<Allele> eventAllelesForSample = new ArrayList<Allele>();
|
||||
for( final Allele a : haplotypeAllelesForSample ) {
|
||||
final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a));
|
||||
for( int iii = 0; iii < alleleMapper.size(); iii++ ) {
|
||||
final ArrayList<Haplotype> mappedHaplotypes = alleleMapper.get(iii);
|
||||
if( mappedHaplotypes.contains(haplotype) ) {
|
||||
eventAllelesForSample.add(eventAlleles.get(iii));
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return eventAllelesForSample;
|
||||
}
|
||||
|
||||
protected static boolean containsVCWithMatchingAlleles( final List<VariantContext> list, final VariantContext vcToTest ) {
|
||||
for( final VariantContext vc : list ) {
|
||||
if( vc.hasSameAllelesAs(vcToTest) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Convenience overload without a haplotype: delegates to the full version with a
 * null haplotype, which disables the symbolic unassembled-event branch there.
 *
 * @return map from reference start position to the VariantContext discovered there,
 *         or null if the alignment start is invalid (SW failure)
 */
protected static HashMap<Integer,VariantContext> generateVCsFromAlignment( final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) {
    return generateVCsFromAlignment(null, alignmentStartHapwrtRef, cigar, ref, alignment, refLoc, sourceNameToAdd, MNP_LOOK_AHEAD); // BUGBUG: needed for compatibility with HaplotypeResolver code
}
|
||||
|
||||
/**
 * Walks a CIGAR-described alignment of a haplotype against the reference and emits
 * a VariantContext for each discovered event (insertion, deletion, SNP/MNP), keyed
 * by its reference start position.
 *
 * Nearby SNPs on the same haplotype separated by at most MNP_LOOK_AHEAD matching
 * bases are merged into a single MNP record.
 *
 * @param haplotype               haplotype being aligned; may be null, which disables
 *                                the symbolic unassembled-event insertion branch
 * @param alignmentStartHapwrtRef offset of the alignment start within ref
 * @param cigar                   alignment CIGAR (only I, S, D, M supported)
 * @param ref                     reference bases for the window
 * @param alignment               aligned haplotype bases
 * @param refLoc                  genomic location of ref[0]
 * @param sourceNameToAdd         source tag for the created VariantContexts
 * @param MNP_LOOK_AHEAD          max matching bases allowed inside a merged MNP
 * @return map from reference start position to event, or null on SW failure
 * @throws ReviewedStingException on an unsupported CIGAR operator
 */
protected static HashMap<Integer,VariantContext> generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd, final int MNP_LOOK_AHEAD ) {
    final HashMap<Integer,VariantContext> vcs = new HashMap<Integer,VariantContext>();

    int refPos = alignmentStartHapwrtRef;
    if( refPos < 0 ) { return null; } // Protection against SW failures
    int alignmentPos = 0;

    for( final CigarElement ce : cigar.getCigarElements() ) {
        final int elementLength = ce.getLength();
        switch( ce.getOperator() ) {
            case I:
                // Insertion: the inserted bases exist only in the alignment, so only
                // alignmentPos advances. All-'N' insertions are ignored.
                final byte[] insertionBases = Arrays.copyOfRange( alignment, alignmentPos, alignmentPos + elementLength );
                boolean allN = true;
                for( final byte b : insertionBases ) {
                    if( b != (byte) 'N' ) {
                        allN = false;
                        break;
                    }
                }
                if( !allN ) {
                    final ArrayList<Allele> insertionAlleles = new ArrayList<Allele>();
                    final int insertionStart = refLoc.getStart() + refPos - 1;
                    // If the insertion abuts one of the haplotype's assembly break points,
                    // emit a symbolic unassembled-event allele instead of the literal bases.
                    if( haplotype != null && (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1 || haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() - 1 == insertionStart + elementLength + 1) ) {
                        insertionAlleles.add( Allele.create(ref[refPos-1], true) );
                        insertionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
                        vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).make());
                    } else {
                        // Old-style (VCF 4.0-like) null reference allele plus the inserted bases,
                        // with the preceding reference base recorded separately.
                        insertionAlleles.add( Allele.create(Allele.NULL_ALLELE_STRING, true) );
                        insertionAlleles.add( Allele.create(insertionBases, false) );
                        vcs.put(insertionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), insertionStart, insertionStart, insertionAlleles).referenceBaseForIndel(ref[refPos-1]).make());
                    }
                }
                alignmentPos += elementLength;
                break;
            case S:
                // Soft clip: consumes alignment bases only; no event is emitted.
                alignmentPos += elementLength;
                break;
            case D:
                // Deletion: bases exist only in the reference, so only refPos advances.
                final byte[] deletionBases = Arrays.copyOfRange( ref, refPos, refPos + elementLength );
                final ArrayList<Allele> deletionAlleles = new ArrayList<Allele>();
                final int deletionStart = refLoc.getStart() + refPos - 1;
                // BUGBUG: how often does this symbolic deletion allele case happen?
                //if( haplotype != null && ( (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 >= deletionStart && haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 < deletionStart + elementLength)
                //        || (haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 >= deletionStart && haplotype.rightBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 < deletionStart + elementLength) ) ) {
                //    deletionAlleles.add( Allele.create(ref[refPos-1], true) );
                //    deletionAlleles.add( SYMBOLIC_UNASSEMBLED_EVENT_ALLELE );
                //    vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart, deletionAlleles).make());
                //} else {
                deletionAlleles.add( Allele.create(deletionBases, true) );
                deletionAlleles.add( Allele.create(Allele.NULL_ALLELE_STRING, false) );
                vcs.put(deletionStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), deletionStart, deletionStart + elementLength, deletionAlleles).referenceBaseForIndel(ref[refPos-1]).make());
                //}
                refPos += elementLength;
                break;
            case M:
                // Match/mismatch run: track a window of consecutive mismatches so that
                // SNPs within MNP_LOOK_AHEAD bases of each other merge into one MNP.
                int numSinceMismatch = -1;     // matching bases seen since the last mismatch (-1 = no open window)
                int stopOfMismatch = -1;       // alignment index of the last mismatch in the open window
                int startOfMismatch = -1;      // alignment index of the first mismatch in the open window
                int refPosStartOfMismatch = -1; // ref index of the first mismatch in the open window
                for( int iii = 0; iii < elementLength; iii++ ) {
                    if( ref[refPos] != alignment[alignmentPos] && alignment[alignmentPos] != ((byte) 'N') ) {
                        // SNP or start of possible MNP
                        if( stopOfMismatch == -1 ) {
                            startOfMismatch = alignmentPos;
                            stopOfMismatch = alignmentPos;
                            numSinceMismatch = 0;
                            refPosStartOfMismatch = refPos;
                        } else {
                            stopOfMismatch = alignmentPos;
                        }
                    }
                    if( stopOfMismatch != -1) {
                        numSinceMismatch++;
                    }
                    // Close the window when the look-ahead budget is exceeded, or at the
                    // end of the element while a window is still open.
                    if( numSinceMismatch > MNP_LOOK_AHEAD || (iii == elementLength - 1 && stopOfMismatch != -1) ) {
                        final byte[] refBases = Arrays.copyOfRange( ref, refPosStartOfMismatch, refPosStartOfMismatch + (stopOfMismatch - startOfMismatch) + 1 );
                        final byte[] mismatchBases = Arrays.copyOfRange( alignment, startOfMismatch, stopOfMismatch + 1 );
                        final ArrayList<Allele> snpAlleles = new ArrayList<Allele>();
                        snpAlleles.add( Allele.create( refBases, true ) );
                        snpAlleles.add( Allele.create( mismatchBases, false ) );
                        // NOTE(review): unlike the indel starts above, this offset does not
                        // subtract 1 — presumably because the event starts on the mismatched
                        // base itself rather than the base before it; confirm.
                        final int snpStart = refLoc.getStart() + refPosStartOfMismatch;
                        vcs.put(snpStart, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), snpStart, snpStart + (stopOfMismatch - startOfMismatch), snpAlleles).make());
                        numSinceMismatch = -1;
                        stopOfMismatch = -1;
                        startOfMismatch = -1;
                        refPosStartOfMismatch = -1;
                    }
                    refPos++;
                    alignmentPos++;
                }
                break;
            case N:
            case H:
            case P:
            default:
                throw new ReviewedStingException( "Unsupported cigar operator created during SW alignment: " + ce.getOperator() );
        }
    }
    return vcs;
}
|
||||
}
|
||||
|
|
@ -0,0 +1,566 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import net.sf.picard.reference.IndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.filters.BadMateFilter;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension;
|
||||
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
|
||||
import org.broadinstitute.sting.gatk.walkers.PartitionType;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.clipping.ReadClipper;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentCollection;
|
||||
import org.broadinstitute.sting.utils.fragments.FragmentUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Call SNPs and indels simultaneously via local de-novo assembly of haplotypes in an active region. Haplotypes are evaluated using an affine gap penalty Pair HMM.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* Input bam file(s) from which to make calls
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* VCF file with raw, unrecalibrated SNP and indel calls.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
 * java \
 *   -jar GenomeAnalysisTK.jar \
 *   -T HaplotypeCaller \
 *   -R reference/human_g1k_v37.fasta \
 *   -I sample1.bam [-I sample2.bam ...] \
 *   --dbsnp dbSNP.vcf \
 *   -stand_call_conf [50.0] \
 *   -stand_emit_conf 10.0 \
 *   [-L targets.interval_list] \
 *   -o output.raw.snps.indels.vcf
|
||||
* </pre>
|
||||
*
|
||||
* <h2>Caveats</h2>
|
||||
* <ul>
|
||||
* <li>The system is under active and continuous development. All outputs, the underlying likelihood model, and command line arguments are likely to change often.</li>
|
||||
* </ul>
|
||||
*
|
||||
* @author rpoplin
|
||||
* @since 8/22/11
|
||||
*/
|
||||
|
||||
@PartitionBy(PartitionType.LOCUS)
|
||||
@ActiveRegionExtension(extension=65, maxRegion=275)
|
||||
public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implements AnnotatorCompatible {
|
||||
|
||||
/**
 * A raw, unfiltered, highly sensitive callset in VCF format.
 */
@Output(doc="File to which variants should be written", required = true)
protected VariantContextWriter vcfWriter = null;

// optional dump of the assembly graph for debugging
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false)
protected PrintStream graphWriter = null;

@Hidden
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
protected String keepRG = null;

// passed to the GenotypingEngine at initialize(); 0 disables MNP merging
@Hidden
@Argument(fullName="mnpLookAhead", shortName="mnpLookAhead", doc = "The number of bases to combine together to form MNPs out of nearby consecutive SNPs on the same haplotype", required = false)
protected int MNP_LOOK_AHEAD = 0;

@Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false)
protected int MIN_PRUNE_FACTOR = 1;

@Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false)
protected boolean GENOTYPE_FULL_ACTIVE_REGION = false;

@Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false)
protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false;

@Advanced
@Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Gap continuation penalty for use in the Pair HMM", required = false)
protected int gcpHMM = 10;

@Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false)
protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000;

// both trigger flags below affect only the isActive() computation
@Argument(fullName="useExpandedTriggerSet", shortName="expandedTriggers", doc = "If specified, use additional, experimental triggers designed to capture larger indels but which may lead to an increase in the false positive rate", required=false)
protected boolean USE_EXPANDED_TRIGGER_SET = false;

@Argument(fullName="useAllelesTrigger", shortName="allelesTrigger", doc = "If specified, use additional trigger on variants found in an external alleles file", required=false)
protected boolean USE_ALLELES_TRIGGER = false;

/**
 * rsIDs from this file are used to populate the ID column of the output. Also, the DB INFO flag will be set when appropriate.
 * dbSNP is not used in any way for the calculations themselves.
 */
@ArgumentCollection
protected DbsnpArgumentCollection dbsnp = new DbsnpArgumentCollection();
public RodBinding<VariantContext> getDbsnpRodBinding() { return dbsnp.dbsnp; }

/**
 * If a call overlaps with a record from the provided comp track, the INFO field will be annotated
 * as such in the output with the track name (e.g. -comp:FOO will have 'FOO' in the INFO field).
 * Records that are filtered in the comp track will be ignored.
 * Note that 'dbSNP' has been special-cased (see the --dbsnp argument).
 */
@Input(fullName="comp", shortName = "comp", doc="comparison VCF file", required=false)
public List<RodBinding<VariantContext>> comps = Collections.emptyList();
public List<RodBinding<VariantContext>> getCompRodBindings() { return comps; }

// The following are not used by the Unified Genotyper
public RodBinding<VariantContext> getSnpEffRodBinding() { return null; }
public List<RodBinding<VariantContext>> getResourceRodBindings() { return Collections.emptyList(); }
public boolean alwaysAppendDbsnpId() { return false; }

/**
 * Which annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available annotations.
 */
@Argument(fullName="annotation", shortName="A", doc="One or more specific annotations to apply to variant calls", required=false)
protected List<String> annotationsToUse = new ArrayList<String>(Arrays.asList(new String[]{"ClippingRankSumTest"}));

/**
 * Which annotations to exclude from output in the VCF file. Note that this argument has higher priority than the -A or -G arguments,
 * so annotations will be excluded even if they are explicitly included with the other options.
 */
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"HaplotypeScore", "MappingQualityZero", "SpanningDeletions", "TandemRepeatAnnotator"}));

/**
 * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
 */
@Argument(fullName="group", shortName="G", doc="One or more classes/groups of annotations to apply to variant calls", required=false)
protected String[] annotationClassesToUse = { "Standard" };

@ArgumentCollection
private UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();

// the calculation arguments: UG_engine keeps the user's thresholds for actual calling,
// UG_engine_simple_genotyper uses the lowered thresholds set in initialize() for isActive()
private UnifiedGenotyperEngine UG_engine = null;
private UnifiedGenotyperEngine UG_engine_simple_genotyper = null;

@Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information about each triggering active region", required = false)
protected boolean DEBUG;

// the assembly engine
LocalAssemblyEngine assemblyEngine = null;

// the likelihoods engine
LikelihoodCalculationEngine likelihoodCalculationEngine = null;

// the genotyping engine
GenotypingEngine genotypingEngine = null;

// the annotation engine
private VariantAnnotatorEngine annotationEngine;

// fasta reference reader to supplement the edges of the reference sequence
private IndexedFastaSequenceFile referenceReader;

// reference base padding size
private static final int REFERENCE_PADDING = 900;

// bases with quality less than or equal to this value are trimmed off the tails of the reads
private static final byte MIN_TAIL_QUALITY = 20;

// unique sample names, filled in during initialize()
private ArrayList<String> samplesList = new ArrayList<String>();
// log10 constants used by the isActive() likelihood accumulation
private final static double LOG_ONE_HALF = -Math.log10(2.0);
private final static double LOG_ONE_THIRD = -Math.log10(3.0);
// GGA-mode alleles collected in isActive() and consumed in map()
private final ArrayList<VariantContext> allelesToGenotype = new ArrayList<VariantContext>();

private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
private final static Allele FAKE_ALT_ALLELE = Allele.create("<FAKE_ALT>", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// initialize
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
 * Walker setup: builds the two UnifiedGenotyper engines (full-threshold for calling,
 * lowered-threshold for the isActive trigger), the annotation engine, the VCF header,
 * the padded reference reader, and the assembly/likelihood/genotyping engines.
 *
 * Note the ordering: UG_engine is built from a clone of UAC BEFORE UAC is mutated,
 * so only UG_engine_simple_genotyper sees the lowered thresholds.
 */
public void initialize() {
    super.initialize();

    // get all of the unique sample names
    Set<String> samples = SampleUtils.getSAMFileSamples(getToolkit().getSAMFileHeader());
    samplesList.addAll( samples );
    // initialize the UnifiedGenotyper Engine which is used to call into the exact model
    UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
    UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
    UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
    UAC.STANDARD_CONFIDENCE_FOR_CALLING = (USE_EXPANDED_TRIGGER_SET ? 0.3 : Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING) ); // low values used for isActive determination only, default/user-specified values used for actual calling
    UAC.STANDARD_CONFIDENCE_FOR_EMITTING = (USE_EXPANDED_TRIGGER_SET ? 0.3 : Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING) ); // low values used for isActive determination only, default/user-specified values used for actual calling
    UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);

    // initialize the output VCF header
    annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());

    Set<VCFHeaderLine> headerInfo = new HashSet<VCFHeaderLine>();

    // all annotation fields from VariantAnnotatorEngine
    headerInfo.addAll(annotationEngine.getVCFAnnotationDescriptions());
    // all callers need to add these standard annotation header lines
    VCFStandardHeaderLines.addStandardInfoLines(headerInfo, true,
            VCFConstants.DOWNSAMPLED_KEY,
            VCFConstants.MLE_ALLELE_COUNT_KEY,
            VCFConstants.MLE_ALLELE_FREQUENCY_KEY);
    // all callers need to add these standard FORMAT field header lines
    VCFStandardHeaderLines.addStandardFormatLines(headerInfo, true,
            VCFConstants.GENOTYPE_KEY,
            VCFConstants.GENOTYPE_QUALITY_KEY,
            VCFConstants.DEPTH_KEY,
            VCFConstants.GENOTYPE_PL_KEY);
    // header lines for the experimental HaplotypeCaller-specific annotations
    headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant"));
    headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region"));
    headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region"));
    headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region"));
    headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles"));
    headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL"));
    headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX"));
    headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant"));

    vcfWriter.writeHeader(new VCFHeader(headerInfo, samples));

    try {
        // fasta reference reader to supplement the edges of the reference sequence
        referenceReader = new CachingIndexedFastaSequenceFile(getToolkit().getArguments().referenceFile);
    } catch( FileNotFoundException e ) {
        throw new UserException.CouldNotReadInputFile(getToolkit().getArguments().referenceFile, e);
    }

    assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
    likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false );
    genotypingEngine = new GenotypingEngine( DEBUG, MNP_LOOK_AHEAD, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// isActive
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
// enable deletions in the pileup so isActive() can count deletion-supporting reads
@Override
public boolean includeReadsWithDeletionAtLoci() { return true; }
|
||||
|
||||
// enable non primary reads in the active region (used during assembly)
@Override
public boolean wantsNonPrimaryReads() { return true; }
|
||||
|
||||
/**
 * Decides how "active" (likely to harbor variation) this locus is, as a probability.
 *
 * In GGA mode, any locus overlapping a provided allele is maximally active (1.0) and
 * the allele is stashed in allelesToGenotype for use during map(). Otherwise a quick
 * ref-vs-any-nonref diploid likelihood is accumulated per sample from the pileup and
 * handed to the lowered-threshold UG engine; the result's phred-scaled quality is
 * converted back to a probability.
 *
 * @param tracker reference-ordered data at this locus
 * @param ref     reference context
 * @param context pileup at this locus; may be null (returns 0.0)
 * @return activity probability in [0, 1]
 */
@Override
@Ensures({"result >= 0.0", "result <= 1.0"})
public double isActive( final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context ) {

    if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
        for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) {
            if( !allelesToGenotype.contains(vc) ) {
                allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a ReadMetaDataTracker object
            }
        }
        if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
            return 1.0;
        }
    }

    if( USE_ALLELES_TRIGGER ) {
        return ( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 );
    }

    if( context == null ) { return 0.0; }

    final List<Allele> noCall = new ArrayList<Allele>(); // used to noCall all genotypes until the exact model is applied
    noCall.add(Allele.NO_CALL);

    final Map<String, AlignmentContext> splitContexts = AlignmentContextUtils.splitContextBySampleName(context);
    final GenotypesContext genotypes = GenotypesContext.create(splitContexts.keySet().size());
    for( final String sample : splitContexts.keySet() ) {
        // genotypeLikelihoods[0]=hom-ref, [1]=het, [2]=hom-"nonref" in log10 space
        final double[] genotypeLikelihoods = new double[3]; // ref versus non-ref (any event)
        Arrays.fill(genotypeLikelihoods, 0.0);

        for( final PileupElement p : splitContexts.get(sample).getBasePileup() ) {
            // with the expanded trigger set, bases near soft clips / insertions get a
            // quality floor of 20 so weak indel evidence still contributes
            final byte qual = ( USE_EXPANDED_TRIGGER_SET ?
                    ( p.isNextToSoftClip() || p.isBeforeInsertion() || p.isAfterInsertion() ? ( p.getQual() > QualityUtils.MIN_USABLE_Q_SCORE ? p.getQual() : (byte) 20 ) : p.getQual() )
                    : p.getQual() );
            if( p.isDeletion() || qual > (USE_EXPANDED_TRIGGER_SET ? QualityUtils.MIN_USABLE_Q_SCORE : (byte) 18) ) {
                // AA/BB are swapped below when the element looks non-reference, so the
                // qualToProbLog10 term always credits the allele the base supports
                int AA = 0; final int AB = 1; int BB = 2;
                if( USE_EXPANDED_TRIGGER_SET ) {
                    if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ||
                        (!p.getRead().getNGSPlatform().equals(NGSPlatform.SOLID) && ((p.getRead().getReadPairedFlag() && p.getRead().getMateUnmappedFlag()) || BadMateFilter.hasBadMate(p.getRead()))) ) {
                        AA = 2;
                        BB = 0;
                    }
                } else {
                    if( p.getBase() != ref.getBase() || p.isDeletion() || p.isBeforeDeletedBase() || p.isAfterDeletedBase() || p.isBeforeInsertion() || p.isAfterInsertion() || p.isNextToSoftClip() ) {
                        AA = 2;
                        BB = 0;
                    }
                }
                genotypeLikelihoods[AA] += QualityUtils.qualToProbLog10(qual);
                genotypeLikelihoods[AB] += MathUtils.approximateLog10SumLog10( QualityUtils.qualToProbLog10(qual) + LOG_ONE_HALF, QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD + LOG_ONE_HALF );
                genotypeLikelihoods[BB] += QualityUtils.qualToErrorProbLog10(qual) + LOG_ONE_THIRD;
            }
        }
        genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
    }

    // wrap the per-sample likelihoods in a throwaway VC with fake alleles and let the
    // lowered-threshold UG engine compute the site quality
    final ArrayList<Allele> alleles = new ArrayList<Allele>();
    alleles.add( FAKE_REF_ALLELE );
    alleles.add( FAKE_ALT_ALLELE );
    final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL);
    return ( vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ) );
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// map
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public Integer map( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion, final RefMetaDataTracker metaDataTracker ) {
|
||||
|
||||
final ArrayList<VariantContext> activeAllelesToGenotype = new ArrayList<VariantContext>();
|
||||
|
||||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
|
||||
for( final VariantContext vc : allelesToGenotype ) {
|
||||
if( activeRegion.getLocation().overlapsP( getToolkit().getGenomeLocParser().createGenomeLoc(vc) ) ) {
|
||||
activeAllelesToGenotype.add(vc); // do something with these VCs during GGA mode
|
||||
}
|
||||
}
|
||||
allelesToGenotype.removeAll( activeAllelesToGenotype );
|
||||
}
|
||||
|
||||
if( !activeRegion.isActive ) { return 0; } // Not active so nothing to do!
|
||||
if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do!
|
||||
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do!
|
||||
|
||||
finalizeActiveRegion( activeRegion ); // merge overlapping fragments, clip adapter and low qual tails
|
||||
final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader)); // Create the reference haplotype which is the bases from the reference that make up the active region
|
||||
referenceHaplotype.setIsReference(true);
|
||||
final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING);
|
||||
//int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion ));
|
||||
final ArrayList<Haplotype> haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, getPaddedLoc(activeRegion), MIN_PRUNE_FACTOR, activeAllelesToGenotype );
|
||||
if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do!
|
||||
|
||||
activeRegion.hardClipToActiveRegion(); // only evaluate the parts of reads that are overlapping the active region
|
||||
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
|
||||
if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
|
||||
|
||||
// evaluate each sample's reads against all haplotypes
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
|
||||
likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, perSampleReadList );
|
||||
|
||||
// subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes )
|
||||
final ArrayList<Haplotype> bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes );
|
||||
|
||||
for( final Pair<VariantContext, HashMap<Allele, ArrayList<Haplotype>>> callResult :
|
||||
( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES
|
||||
? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser() )
|
||||
: genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
|
||||
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
|
||||
|
||||
final Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult );
|
||||
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
|
||||
|
||||
// add some custom annotations to the calls
|
||||
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());
|
||||
// Calculate the number of variants on the haplotype
|
||||
int maxNumVar = 0;
|
||||
for( final Allele allele : callResult.getFirst().getAlleles() ) {
|
||||
if( !allele.isReference() ) {
|
||||
for( final Haplotype haplotype : callResult.getSecond().get(allele) ) {
|
||||
final int numVar = haplotype.getEventMap().size();
|
||||
if( numVar > maxNumVar ) { maxNumVar = numVar; }
|
||||
}
|
||||
}
|
||||
}
|
||||
// Calculate the event length
|
||||
int maxLength = 0;
|
||||
for ( final Allele a : annotatedCall.getAlternateAlleles() ) {
|
||||
final int length = a.length() - annotatedCall.getReference().length();
|
||||
if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
|
||||
}
|
||||
|
||||
myAttributes.put("NVH", maxNumVar);
|
||||
myAttributes.put("NumHapEval", bestHaplotypes.size());
|
||||
myAttributes.put("NumHapAssembly", haplotypes.size());
|
||||
myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size());
|
||||
myAttributes.put("EVENTLENGTH", maxLength);
|
||||
myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") );
|
||||
myAttributes.put("extType", annotatedCall.getType().toString() );
|
||||
|
||||
//if( likelihoodCalculationEngine.haplotypeScore != null ) {
|
||||
// myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore));
|
||||
//}
|
||||
if( annotatedCall.hasAttribute("QD") ) {
|
||||
myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) );
|
||||
}
|
||||
|
||||
vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() );
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("----------------------------------------------------------------------------------"); }
|
||||
|
||||
return 1; // One active region was processed during this map call
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// reduce
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Override
|
||||
public Integer reduceInit() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Integer reduce(Integer cur, Integer sum) {
|
||||
return cur + sum;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onTraversalDone(Integer result) {
|
||||
logger.info("Ran local assembly on " + result + " active regions");
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// private helper functions
|
||||
//
|
||||
//---------------------------------------------------------------------------------------------------------------
|
||||
|
||||
private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getExtendedLoc() + " with " + activeRegion.size() + " reads:"); }
|
||||
final ArrayList<GATKSAMRecord> finalizedReadList = new ArrayList<GATKSAMRecord>();
|
||||
final FragmentCollection<GATKSAMRecord> fragmentCollection = FragmentUtils.create( ReadUtils.sortReadsByCoordinate(activeRegion.getReads()) );
|
||||
activeRegion.clearReads();
|
||||
|
||||
// Join overlapping paired reads to create a single longer read
|
||||
finalizedReadList.addAll( fragmentCollection.getSingletonReads() );
|
||||
for( final List<GATKSAMRecord> overlappingPair : fragmentCollection.getOverlappingPairs() ) {
|
||||
finalizedReadList.addAll( FragmentUtils.mergeOverlappingPairedFragments(overlappingPair) );
|
||||
}
|
||||
|
||||
Collections.shuffle(finalizedReadList, GenomeAnalysisEngine.getRandomGenerator());
|
||||
|
||||
// Loop through the reads hard clipping the adaptor and low quality tails
|
||||
for( final GATKSAMRecord myRead : finalizedReadList ) {
|
||||
final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) );
|
||||
if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) {
|
||||
final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY );
|
||||
// protect against INTERVALS with abnormally high coverage
|
||||
if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) {
|
||||
activeRegion.add(clippedRead);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private List<GATKSAMRecord> filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
final ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>();
|
||||
for( final GATKSAMRecord rec : activeRegion.getReads() ) {
|
||||
if( rec.getReadLength() < 24 || rec.getMappingQuality() <= 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) {
|
||||
readsToRemove.add(rec);
|
||||
}
|
||||
}
|
||||
activeRegion.removeAll( readsToRemove );
|
||||
return readsToRemove;
|
||||
}
|
||||
|
||||
private GenomeLoc getPaddedLoc( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) {
|
||||
final int padLeft = Math.max(activeRegion.getReferenceLoc().getStart()-REFERENCE_PADDING, 1);
|
||||
final int padRight = Math.min(activeRegion.getReferenceLoc().getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(activeRegion.getReferenceLoc().getContig()).getSequenceLength());
|
||||
return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReferenceLoc().getContig(), padLeft, padRight);
|
||||
}
|
||||
|
||||
private HashMap<String, ArrayList<GATKSAMRecord>> splitReadsBySample( final List<GATKSAMRecord> reads ) {
|
||||
final HashMap<String, ArrayList<GATKSAMRecord>> returnMap = new HashMap<String, ArrayList<GATKSAMRecord>>();
|
||||
for( final String sample : samplesList) {
|
||||
ArrayList<GATKSAMRecord> readList = returnMap.get( sample );
|
||||
if( readList == null ) {
|
||||
readList = new ArrayList<GATKSAMRecord>();
|
||||
returnMap.put(sample, readList);
|
||||
}
|
||||
}
|
||||
for( final GATKSAMRecord read : reads ) {
|
||||
returnMap.get(read.getReadGroup().getSample()).add(read);
|
||||
}
|
||||
|
||||
return returnMap;
|
||||
}
|
||||
|
||||
/*
|
||||
private int determinePruneFactorFromCoverage( final ActiveRegion activeRegion ) {
|
||||
final ArrayList<Integer> readLengthDistribution = new ArrayList<Integer>();
|
||||
for( final GATKSAMRecord read : activeRegion.getReads() ) {
|
||||
readLengthDistribution.add(read.getReadLength());
|
||||
}
|
||||
final double meanReadLength = MathUtils.average(readLengthDistribution);
|
||||
final double meanCoveragePerSample = (double) activeRegion.getReads().size() / ((double) activeRegion.getExtendedLoc().size() / meanReadLength) / (double) samplesList.size();
|
||||
int PRUNE_FACTOR = 0;
|
||||
if( meanCoveragePerSample > 8.5 ) {
|
||||
PRUNE_FACTOR = (int) Math.floor( Math.sqrt( meanCoveragePerSample - 5.0 ) );
|
||||
} else if( meanCoveragePerSample > 3.0 ) {
|
||||
PRUNE_FACTOR = 1;
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println(String.format("Mean coverage per sample = %.1f --> prune factor = %d", meanCoveragePerSample, PRUNE_FACTOR)); }
|
||||
return PRUNE_FACTOR;
|
||||
}
|
||||
*/
|
||||
}
|
||||
|
|
@ -0,0 +1,441 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.commandline.Output;
|
||||
import org.broadinstitute.sting.commandline.RodBinding;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
||||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Reference;
|
||||
import org.broadinstitute.sting.gatk.walkers.RodWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Window;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Haplotype-based resolution of variants in 2 different eval files.
|
||||
*
|
||||
* <p>
|
||||
* HaplotypeResolver is a tool that takes 2 VCF files and constructs haplotypes based on the variants inside them.
|
||||
* From that, it can resolve potential differences in variant calls that are inherently the same (or similar) variants.
|
||||
* Records are annotated with the set and status attributes.
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* 2 variant files to resolve.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Output</h2>
|
||||
* <p>
|
||||
* A single consensus VCF.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx1g -jar GenomeAnalysisTK.jar \
|
||||
* -R ref.fasta \
|
||||
* -T HaplotypeResolver \
|
||||
* -V:v1 input1.vcf \
|
||||
* -V:v2 input2.vcf \
|
||||
* -o output.vcf
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
@Reference(window=@Window(start=-HaplotypeResolver.ACTIVE_WINDOW,stop= HaplotypeResolver.ACTIVE_WINDOW))
|
||||
public class HaplotypeResolver extends RodWalker<Integer, Integer> {
|
||||
|
||||
protected static final String INTERSECTION_SET = "intersection";
|
||||
protected static final String SAME_STATUS = "same";
|
||||
protected static final String SOME_ALLELES_MATCH_STATUS = "someAllelesMatch";
|
||||
protected static final String SAME_START_DIFFERENT_ALLELES_STATUS = "sameStartDifferentAlleles";
|
||||
protected static final String SAME_BY_HAPLOTYPE_STATUS = "sameByHaplotype";
|
||||
protected static final String ONE_ALLELE_SUBSET_OF_OTHER_STATUS = "OneAlleleSubsetOfOther";
|
||||
protected static final String OVERLAPPING_EVENTS_STATUS = "overlappingEvents";
|
||||
|
||||
protected final static int MAX_DISTANCE_BETWEEN_MERGED_RECORDS = 50;
|
||||
protected final static int MAX_HAPLOTYPE_TO_CONSIDER = 1000;
|
||||
protected final static int MAX_VARIANT_SIZE_TO_CONSIDER = 100;
|
||||
protected final static int ACTIVE_WINDOW = MAX_HAPLOTYPE_TO_CONSIDER + MAX_VARIANT_SIZE_TO_CONSIDER;
|
||||
|
||||
@Input(fullName="variant", shortName = "V", doc="Input VCF file", required=true)
|
||||
public List<RodBinding<VariantContext>> variants;
|
||||
|
||||
@Output(doc="File to which variants should be written", required=true)
|
||||
protected VariantContextWriter baseWriter = null;
|
||||
private VariantContextWriter writer;
|
||||
|
||||
/**
|
||||
* Set to 'null' if you don't want the set field emitted.
|
||||
*/
|
||||
@Argument(fullName="setKey", shortName="setKey", doc="Key used in the INFO key=value tag emitted describing which set the combined VCF record came from", required=false)
|
||||
protected String SET_KEY = "set";
|
||||
|
||||
/**
|
||||
* Set to 'null' if you don't want the status field emitted.
|
||||
*/
|
||||
@Argument(fullName="statusKey", shortName="statusKey", doc="Key used in the INFO key=value tag emitted describing the extent to which records match", required=false)
|
||||
protected String STATUS_KEY = "status";
|
||||
|
||||
private final LinkedList<VCcontext> queue = new LinkedList<VCcontext>();
|
||||
private String source1, source2;
|
||||
private final List<VariantContext> sourceVCs1 = new ArrayList<VariantContext>();
|
||||
private final List<VariantContext> sourceVCs2 = new ArrayList<VariantContext>();
|
||||
|
||||
|
||||
private class VCcontext {
|
||||
public final Collection<VariantContext> vcs;
|
||||
public final GenomeLoc loc;
|
||||
public final ReferenceContext ref;
|
||||
|
||||
public VCcontext(final Collection<VariantContext> vcs, final ReferenceContext ref) {
|
||||
this.vcs = vcs;
|
||||
this.loc = getToolkit().getGenomeLocParser().createGenomeLoc(vcs.iterator().next());
|
||||
this.ref = ref;
|
||||
}
|
||||
}
|
||||
|
||||
public void initialize() {
|
||||
|
||||
if ( variants.size() != 2 ) {
|
||||
throw new UserException.BadArgumentValue("variant", "this tool requires exactly 2 input variant files");
|
||||
}
|
||||
source1 = variants.get(0).getName();
|
||||
source2 = variants.get(1).getName();
|
||||
|
||||
if ( SET_KEY.toLowerCase().equals("null") )
|
||||
SET_KEY = null;
|
||||
if ( STATUS_KEY.toLowerCase().equals("null") )
|
||||
STATUS_KEY = null;
|
||||
|
||||
// for now, INFO and FORMAT fields are not propagated to the output VCF (so they aren't put into the header)
|
||||
Set<VCFHeaderLine> headerLines = new HashSet<VCFHeaderLine>();
|
||||
if ( SET_KEY != null )
|
||||
headerLines.add(new VCFInfoHeaderLine(SET_KEY, 1, VCFHeaderLineType.String, "Source VCF for the merged record"));
|
||||
if ( STATUS_KEY != null )
|
||||
headerLines.add(new VCFInfoHeaderLine(STATUS_KEY, 1, VCFHeaderLineType.String, "Extent to which records match"));
|
||||
final VCFHeader vcfHeader = new VCFHeader(headerLines, Collections.<String>emptySet());
|
||||
baseWriter.writeHeader(vcfHeader);
|
||||
writer = VariantContextWriterFactory.sortOnTheFly(baseWriter, ACTIVE_WINDOW);
|
||||
}
|
||||
|
||||
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) {
|
||||
if ( tracker == null )
|
||||
return 0;
|
||||
|
||||
final Collection<VariantContext> VCs = tracker.getValues(variants, context.getLocation());
|
||||
if ( VCs.size() == 0 )
|
||||
return 0;
|
||||
|
||||
final VCcontext vc = new VCcontext(VariantContextUtils.sitesOnlyVariantContexts(VCs), ref);
|
||||
|
||||
// TODO -- what should we do about filtered records?
|
||||
|
||||
if ( !queue.isEmpty() ) {
|
||||
|
||||
final VCcontext previous = queue.getLast();
|
||||
if ( !previous.loc.onSameContig(vc.loc) ||
|
||||
previous.loc.distance(vc.loc) > MAX_DISTANCE_BETWEEN_MERGED_RECORDS ||
|
||||
queue.getFirst().loc.distance(vc.loc) > MAX_HAPLOTYPE_TO_CONSIDER ) {
|
||||
purgeQueue();
|
||||
}
|
||||
}
|
||||
|
||||
queue.addLast(vc);
|
||||
return 0;
|
||||
}
|
||||
|
||||
public Integer reduceInit() {
    // The reduce accumulator is not meaningfully used by this walker.
    return 0;
}
|
||||
|
||||
public Integer reduce(Integer value, Integer sum) {
|
||||
return sum + value;
|
||||
}
|
||||
|
||||
public void onTraversalDone(Integer result) {
|
||||
if ( !queue.isEmpty() )
|
||||
purgeQueue();
|
||||
writer.close();
|
||||
}
|
||||
|
||||
private void purgeQueue() {
|
||||
|
||||
final ReferenceContext refContext = queue.getFirst().ref;
|
||||
|
||||
// divide them up by source
|
||||
while ( !queue.isEmpty() ) {
|
||||
VCcontext context = queue.removeFirst();
|
||||
for ( final VariantContext vc: context.vcs ) {
|
||||
if ( vc.getSource().equals(source1) )
|
||||
sourceVCs1.add(vc);
|
||||
else
|
||||
sourceVCs2.add(vc);
|
||||
}
|
||||
}
|
||||
|
||||
writeAndPurgeAllEqualVariants(sourceVCs1, sourceVCs2, SAME_STATUS);
|
||||
|
||||
if ( sourceVCs1.isEmpty() ) {
|
||||
writeAll(sourceVCs2, source2, null);
|
||||
} else if ( sourceVCs2.isEmpty() ) {
|
||||
writeAll(sourceVCs1, source1, null);
|
||||
} else {
|
||||
resolveByHaplotype(refContext);
|
||||
}
|
||||
|
||||
// allow for GC of the data
|
||||
sourceVCs1.clear();
|
||||
sourceVCs2.clear();
|
||||
}
|
||||
|
||||
private void writeAll(final List<VariantContext> sourceVCs, final String set, final String status) {
|
||||
for ( final VariantContext vc : sourceVCs ) {
|
||||
writeOne(vc, set, status);
|
||||
}
|
||||
}
|
||||
|
||||
private void writeOne(final VariantContext vc, final String set, final String status) {
|
||||
final Map<String, Object> attrs = new HashMap<String, Object>(vc.getAttributes());
|
||||
if ( SET_KEY != null && set != null )
|
||||
attrs.put(SET_KEY, set);
|
||||
if ( STATUS_KEY != null && status != null )
|
||||
attrs.put(STATUS_KEY, status);
|
||||
writer.add(new VariantContextBuilder(vc).attributes(attrs).make());
|
||||
}
|
||||
|
||||
private void writeAndPurgeAllEqualVariants(final List<VariantContext> sourceVCs1, final List<VariantContext> sourceVCs2, final String status) {
|
||||
|
||||
int currentIndex1 = 0, currentIndex2 = 0;
|
||||
int size1 = sourceVCs1.size(), size2 = sourceVCs2.size();
|
||||
VariantContext current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null);
|
||||
VariantContext current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null);
|
||||
|
||||
while ( current1 != null && current2 != null ) {
|
||||
|
||||
final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1);
|
||||
final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2);
|
||||
|
||||
if ( loc1.equals(loc2) ||
|
||||
(loc1.getStart() == loc2.getStart() && (current1.getAlternateAlleles().size() > 1 || current2.getAlternateAlleles().size() > 1)) ) {
|
||||
// test the alleles
|
||||
if ( determineAndWriteOverlap(current1, current2, status) ) {
|
||||
sourceVCs1.remove(currentIndex1);
|
||||
sourceVCs2.remove(currentIndex2);
|
||||
size1--;
|
||||
size2--;
|
||||
} else {
|
||||
currentIndex1++;
|
||||
currentIndex2++;
|
||||
}
|
||||
current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null);
|
||||
current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null);
|
||||
} else if ( loc1.isBefore(loc2) ) {
|
||||
currentIndex1++;
|
||||
current1 = (currentIndex1 < size1 ? sourceVCs1.get(currentIndex1): null);
|
||||
} else {
|
||||
currentIndex2++;
|
||||
current2 = (currentIndex2 < size2 ? sourceVCs2.get(currentIndex2): null);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private boolean determineAndWriteOverlap(final VariantContext vc1, final VariantContext vc2, final String status) {
|
||||
final int allelesFrom1In2 = findOverlap(vc1, vc2);
|
||||
final int allelesFrom2In1 = findOverlap(vc2, vc1);
|
||||
final int totalAllelesIn1 = vc1.getAlternateAlleles().size();
|
||||
final int totalAllelesIn2 = vc2.getAlternateAlleles().size();
|
||||
|
||||
final boolean allAllelesFrom1Overlap = allelesFrom1In2 == totalAllelesIn1;
|
||||
final boolean allAllelesFrom2Overlap = allelesFrom2In1 == totalAllelesIn2;
|
||||
|
||||
boolean thereIsOverlap = true;
|
||||
|
||||
if ( allAllelesFrom1Overlap && allAllelesFrom2Overlap ) {
|
||||
writeOne(vc1, INTERSECTION_SET, status);
|
||||
} else if ( allAllelesFrom1Overlap ) {
|
||||
writeOne(vc2, INTERSECTION_SET, source1 + "IsSubsetOf" + source2);
|
||||
} else if ( allAllelesFrom2Overlap ) {
|
||||
writeOne(vc1, INTERSECTION_SET, source2 + "IsSubsetOf" + source1);
|
||||
} else if ( allelesFrom1In2 > 0 ) {
|
||||
writeOne(vc1, INTERSECTION_SET, SOME_ALLELES_MATCH_STATUS);
|
||||
} else if ( totalAllelesIn1 > 1 || totalAllelesIn2 > 1 ) { // we don't handle multi-allelics in the haplotype-based reconstruction
|
||||
writeOne(vc1, INTERSECTION_SET, SAME_START_DIFFERENT_ALLELES_STATUS);
|
||||
} else {
|
||||
thereIsOverlap = false;
|
||||
}
|
||||
|
||||
return thereIsOverlap;
|
||||
}
|
||||
|
||||
private static int findOverlap(final VariantContext target, final VariantContext comparison) {
|
||||
int overlap = 0;
|
||||
for ( final Allele allele : target.getAlternateAlleles() ) {
|
||||
if ( comparison.hasAlternateAllele(allele) )
|
||||
overlap++;
|
||||
}
|
||||
return overlap;
|
||||
}
|
||||
|
||||
private static final double SW_MATCH = 4.0;
|
||||
private static final double SW_MISMATCH = -10.0;
|
||||
private static final double SW_GAP = -25.0;
|
||||
private static final double SW_GAP_EXTEND = -1.3;
|
||||
private void resolveByHaplotype(final ReferenceContext refContext) {
|
||||
|
||||
final byte[] source1Haplotype = generateHaplotype(sourceVCs1, refContext);
|
||||
final byte[] source2Haplotype = generateHaplotype(sourceVCs2, refContext);
|
||||
|
||||
final SWPairwiseAlignment swConsensus1 = new SWPairwiseAlignment( refContext.getBases(), source1Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
|
||||
final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( refContext.getBases(), source2Haplotype, SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
|
||||
|
||||
// protect against SW failures
|
||||
if( swConsensus1.getCigar().toString().contains("S") || swConsensus1.getCigar().getReferenceLength() < 20 ||
|
||||
swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() < 20 ) {
|
||||
// TODO -- handle errors appropriately
|
||||
logger.debug("Bad SW alignment; aborting at " + refContext.getLocus());
|
||||
return;
|
||||
}
|
||||
|
||||
// order results by start position
|
||||
final TreeMap<Integer, VariantContext> source1Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(0, swConsensus1.getCigar(), refContext.getBases(), source1Haplotype, refContext.getWindow(), source1, 0));
|
||||
final TreeMap<Integer, VariantContext> source2Map = new TreeMap<Integer, VariantContext>(GenotypingEngine.generateVCsFromAlignment(0, swConsensus2.getCigar(), refContext.getBases(), source2Haplotype, refContext.getWindow(), source2, 0));
|
||||
if ( source1Map.size() == 0 || source2Map.size() == 0 ) {
|
||||
// TODO -- handle errors appropriately
|
||||
logger.debug("No source alleles; aborting at " + refContext.getLocus());
|
||||
return;
|
||||
}
|
||||
|
||||
// create lists and test for equality
|
||||
final List<VariantContext> source1Alleles = new ArrayList<VariantContext>(source1Map.values());
|
||||
final List<VariantContext> source2Alleles = new ArrayList<VariantContext>(source2Map.values());
|
||||
|
||||
writeAndPurgeAllEqualVariants(source1Alleles, source2Alleles, SAME_BY_HAPLOTYPE_STATUS);
|
||||
if ( source1Alleles.isEmpty() ) {
|
||||
writeAll(source2Alleles, source2, null);
|
||||
} else if ( source2Alleles.isEmpty() ) {
|
||||
writeAll(source1Alleles, source1, null);
|
||||
} else {
|
||||
writeDifferences(source1Alleles, source2Alleles);
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] generateHaplotype(final List<VariantContext> sourceVCs, final ReferenceContext refContext) {
|
||||
|
||||
final StringBuilder sb = new StringBuilder();
|
||||
|
||||
final int startPos = refContext.getWindow().getStart();
|
||||
int currentPos = startPos;
|
||||
final byte[] reference = refContext.getBases();
|
||||
|
||||
for ( final VariantContext vc : sourceVCs ) {
|
||||
// add any missing reference context
|
||||
int vcStart = vc.getStart();
|
||||
final int refAlleleLength = vc.getReference().length();
|
||||
if ( refAlleleLength == vc.getEnd() - vc.getStart() ) // this is a deletion (whereas for other events the padding base isn't part of the position)
|
||||
vcStart++;
|
||||
|
||||
while ( currentPos < vcStart )
|
||||
sb.append((char)reference[currentPos++ - startPos]);
|
||||
|
||||
// add the alt allele
|
||||
sb.append(vc.getAlternateAllele(0).getBaseString());
|
||||
|
||||
// skip the reference allele
|
||||
currentPos += refAlleleLength;
|
||||
}
|
||||
// add any missing reference context
|
||||
final int stopPos = refContext.getWindow().getStop();
|
||||
while ( currentPos < stopPos )
|
||||
sb.append((char)reference[currentPos++ - startPos]);
|
||||
|
||||
return sb.toString().getBytes();
|
||||
}
|
||||
|
||||
private void writeDifferences(final List<VariantContext> source1Alleles, final List<VariantContext> source2Alleles) {
|
||||
int currentIndex1 = 0, currentIndex2 = 0;
|
||||
final int size1 = source1Alleles.size(), size2 = source2Alleles.size();
|
||||
VariantContext current1 = source1Alleles.get(0);
|
||||
VariantContext current2 = source2Alleles.get(0);
|
||||
|
||||
while ( currentIndex1 < size1 || currentIndex2 < size2 ) {
|
||||
if ( current1 == null ) {
|
||||
writeOne(current2, source2, null);
|
||||
currentIndex2++;
|
||||
current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null);
|
||||
} else if ( current2 == null ) {
|
||||
writeOne(current1, source1, null);
|
||||
currentIndex1++;
|
||||
current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null);
|
||||
} else {
|
||||
|
||||
final GenomeLoc loc1 = getToolkit().getGenomeLocParser().createGenomeLoc(current1);
|
||||
final GenomeLoc loc2 = getToolkit().getGenomeLocParser().createGenomeLoc(current2);
|
||||
|
||||
if ( loc1.getStart() == loc2.getStart() || loc1.overlapsP(loc2) ) {
|
||||
String status;
|
||||
if ( loc1.getStart() == loc2.getStart() ) {
|
||||
final String allele1 = current1.getAlternateAllele(0).getBaseString();
|
||||
final String allele2 = current2.getAlternateAllele(0).getBaseString();
|
||||
if ( allele1.indexOf(allele2) != -1 || allele2.indexOf(allele1) != -1 )
|
||||
status = ONE_ALLELE_SUBSET_OF_OTHER_STATUS;
|
||||
else
|
||||
status = SAME_START_DIFFERENT_ALLELES_STATUS;
|
||||
} else {
|
||||
status = OVERLAPPING_EVENTS_STATUS;
|
||||
}
|
||||
|
||||
writeOne(current1, INTERSECTION_SET, status);
|
||||
currentIndex1++;
|
||||
currentIndex2++;
|
||||
current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null);
|
||||
current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null);
|
||||
} else if ( loc1.isBefore(loc2) ) {
|
||||
writeOne(current1, source1, null);
|
||||
currentIndex1++;
|
||||
current1 = (currentIndex1 < size1 ? source1Alleles.get(currentIndex1): null);
|
||||
} else {
|
||||
writeOne(current2, source2, null);
|
||||
currentIndex2++;
|
||||
current2 = (currentIndex2 < size2 ? source2Alleles.get(currentIndex2): null);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,149 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Mar 23, 2011
|
||||
*/
|
||||
// Class for finding the K best paths (as determined by the sum of multiplicities of the edges) in a graph.
|
||||
// This is different from most graph traversals because we want to test paths from any source node to any sink node.
|
||||
public class KBestPaths {
|
||||
|
||||
// static access only
|
||||
protected KBestPaths() { }
|
||||
private static int MAX_PATHS_TO_HOLD = 100;
|
||||
|
||||
protected static class MyInt { public int val = 0; }
|
||||
|
||||
// class to keep track of paths
|
||||
protected static class Path {
|
||||
|
||||
// the last vertex seen in the path
|
||||
private DeBruijnVertex lastVertex;
|
||||
|
||||
// the list of edges comprising the path
|
||||
private ArrayList<DeBruijnEdge> edges;
|
||||
|
||||
// the scores for the path
|
||||
private int totalScore = 0, lowestEdge = -1;
|
||||
|
||||
public Path( final DeBruijnVertex initialVertex ) {
|
||||
lastVertex = initialVertex;
|
||||
edges = new ArrayList<DeBruijnEdge>(0);
|
||||
}
|
||||
|
||||
public Path( final Path p, final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge ) {
|
||||
lastVertex = graph.getEdgeTarget(edge);
|
||||
edges = new ArrayList<DeBruijnEdge>(p.edges);
|
||||
edges.add(edge);
|
||||
totalScore = p.totalScore + edge.getMultiplicity();
|
||||
lowestEdge = ( p.lowestEdge == -1 ) ? edge.getMultiplicity() : Math.min(p.lowestEdge, edge.getMultiplicity());
|
||||
}
|
||||
|
||||
public boolean containsEdge( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final DeBruijnEdge edge ) {
|
||||
final DeBruijnVertex targetVertex = graph.getEdgeTarget(edge);
|
||||
for( final DeBruijnEdge e : edges ) {
|
||||
if( e.equals(graph, edge) || graph.getEdgeTarget(e).equals(targetVertex) ) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public ArrayList<DeBruijnEdge> getEdges() { return edges; }
|
||||
|
||||
public int getScore() { return totalScore; }
|
||||
|
||||
public int getLowestEdge() { return lowestEdge; }
|
||||
|
||||
public DeBruijnVertex getLastVertexInPath() { return lastVertex; }
|
||||
|
||||
public byte[] getBases( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph ) {
|
||||
if( edges.size() == 0 ) { return lastVertex.getSequence(); }
|
||||
|
||||
byte[] bases = graph.getEdgeSource( edges.get(0) ).getSequence();
|
||||
for( final DeBruijnEdge e : edges ) {
|
||||
bases = ArrayUtils.addAll(bases, graph.getEdgeTarget( e ).getSuffix());
|
||||
}
|
||||
return bases;
|
||||
}
|
||||
}
|
||||
|
||||
protected static class PathComparatorTotalScore implements Comparator<Path> {
|
||||
public int compare(final Path path1, final Path path2) {
|
||||
return path1.totalScore - path2.totalScore;
|
||||
}
|
||||
}
|
||||
|
||||
protected static class PathComparatorLowestEdge implements Comparator<Path> {
|
||||
public int compare(final Path path1, final Path path2) {
|
||||
return path2.lowestEdge - path1.lowestEdge;
|
||||
}
|
||||
}
|
||||
|
||||
public static List<Path> getKBestPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final int k ) {
|
||||
if( k > MAX_PATHS_TO_HOLD/2 ) { throw new ReviewedStingException("Asked for more paths than MAX_PATHS_TO_HOLD!"); }
|
||||
final ArrayList<Path> bestPaths = new ArrayList<Path>();
|
||||
|
||||
// run a DFS for best paths
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
if( graph.inDegreeOf(v) == 0 ) {
|
||||
findBestPaths(graph, new Path(v), bestPaths);
|
||||
}
|
||||
}
|
||||
|
||||
Collections.sort(bestPaths, new PathComparatorLowestEdge() );
|
||||
Collections.reverse(bestPaths);
|
||||
return bestPaths.subList(0, Math.min(k, bestPaths.size()));
|
||||
}
|
||||
|
||||
private static void findBestPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Path path, final List<Path> bestPaths ) {
|
||||
findBestPaths(graph, path, bestPaths, new MyInt());
|
||||
}
|
||||
|
||||
private static void findBestPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Path path, final List<Path> bestPaths, MyInt n ) {
|
||||
|
||||
// did we hit the end of a path?
|
||||
if ( allOutgoingEdgesHaveBeenVisited(graph, path) ) {
|
||||
if ( bestPaths.size() >= MAX_PATHS_TO_HOLD ) {
|
||||
// clean out some low scoring paths
|
||||
Collections.sort(bestPaths, new PathComparatorLowestEdge() );
|
||||
for(int iii = 0; iii < 20; iii++) { bestPaths.remove(0); }
|
||||
}
|
||||
bestPaths.add(path);
|
||||
} else if( n.val > 10000) {
|
||||
// do nothing, just return
|
||||
} else {
|
||||
// recursively run DFS
|
||||
final ArrayList<DeBruijnEdge> edgeArrayList = new ArrayList<DeBruijnEdge>();
|
||||
edgeArrayList.addAll(graph.outgoingEdgesOf(path.lastVertex));
|
||||
Collections.sort(edgeArrayList);
|
||||
Collections.reverse(edgeArrayList);
|
||||
for ( final DeBruijnEdge edge : edgeArrayList ) {
|
||||
// make sure the edge is not already in the path
|
||||
if ( path.containsEdge(graph, edge) )
|
||||
continue;
|
||||
|
||||
final Path newPath = new Path(path, graph, edge);
|
||||
n.val++;
|
||||
findBestPaths(graph, newPath, bestPaths, n);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean allOutgoingEdgesHaveBeenVisited( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final Path path ) {
|
||||
for( final DeBruijnEdge edge : graph.outgoingEdgesOf(path.lastVertex) ) {
|
||||
if( !path.containsEdge(graph, edge) ) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,403 @@
|
|||
/*
|
||||
* Copyright (c) 2011 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
public class LikelihoodCalculationEngine {
|
||||
|
||||
private static final double LOG_ONE_HALF = -Math.log10(2.0);
|
||||
private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1;
|
||||
private final byte constantGCP;
|
||||
private final boolean DEBUG;
|
||||
private final PairHMM pairHMM;
|
||||
|
||||
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) {
|
||||
pairHMM = new PairHMM( noBanded );
|
||||
this.constantGCP = constantGCP;
|
||||
DEBUG = debug;
|
||||
}
|
||||
|
||||
public void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList ) {
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
|
||||
int X_METRIC_LENGTH = 0;
|
||||
for( final String sample : perSampleReadList.keySet() ) {
|
||||
for( final GATKSAMRecord read : perSampleReadList.get(sample) ) {
|
||||
final int readLength = read.getReadLength();
|
||||
if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; }
|
||||
}
|
||||
}
|
||||
int Y_METRIC_LENGTH = 0;
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
final int haplotypeLength = haplotypes.get(jjj).getBases().length;
|
||||
if( haplotypeLength > Y_METRIC_LENGTH ) { Y_METRIC_LENGTH = haplotypeLength; }
|
||||
}
|
||||
|
||||
// M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
|
||||
X_METRIC_LENGTH += 2;
|
||||
Y_METRIC_LENGTH += 2;
|
||||
|
||||
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases
|
||||
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
|
||||
|
||||
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
|
||||
|
||||
// for each sample's reads
|
||||
for( final String sample : perSampleReadList.keySet() ) {
|
||||
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
|
||||
// evaluate the likelihood of the reads given those haplotypes
|
||||
computeReadLikelihoods( haplotypes, perSampleReadList.get(sample), sample, matchMetricArray, XMetricArray, YMetricArray );
|
||||
}
|
||||
}
|
||||
|
||||
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample,
|
||||
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final int numReads = reads.size();
|
||||
final double[][] readLikelihoods = new double[numHaplotypes][numReads];
|
||||
for( int iii = 0; iii < numReads; iii++ ) {
|
||||
final GATKSAMRecord read = reads.get(iii);
|
||||
|
||||
final byte[] overallGCP = new byte[read.getReadLength()];
|
||||
Arrays.fill( overallGCP, constantGCP ); // Is there a way to derive empirical estimates for this from the data?
|
||||
Haplotype previousHaplotypeSeen = null;
|
||||
final byte[] readQuals = read.getBaseQualities();
|
||||
final byte[] readInsQuals = read.getBaseInsertionQualities();
|
||||
final byte[] readDelQuals = read.getBaseDeletionQualities();
|
||||
for( int kkk = 0; kkk < readQuals.length; kkk++ ) {
|
||||
readQuals[kkk] = ( readQuals[kkk] > (byte) read.getMappingQuality() ? (byte) read.getMappingQuality() : readQuals[kkk] ); // cap base quality by mapping quality
|
||||
//readQuals[kkk] = ( readQuals[kkk] > readInsQuals[kkk] ? readInsQuals[kkk] : readQuals[kkk] ); // cap base quality by base insertion quality, needs to be evaluated
|
||||
//readQuals[kkk] = ( readQuals[kkk] > readDelQuals[kkk] ? readDelQuals[kkk] : readQuals[kkk] ); // cap base quality by base deletion quality, needs to be evaluated
|
||||
readQuals[kkk] = ( readQuals[kkk] < (byte) 17 ? QualityUtils.MIN_USABLE_Q_SCORE : readQuals[kkk] );
|
||||
}
|
||||
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
final Haplotype haplotype = haplotypes.get(jjj);
|
||||
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
|
||||
previousHaplotypeSeen = haplotype;
|
||||
|
||||
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(),
|
||||
readQuals, readInsQuals, readDelQuals, overallGCP,
|
||||
haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
|
||||
}
|
||||
}
|
||||
for( int jjj = 0; jjj < numHaplotypes; jjj++ ) {
|
||||
haplotypes.get(jjj).addReadLikelihoods( sample, readLikelihoods[jjj] );
|
||||
}
|
||||
}
|
||||
|
||||
private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) {
|
||||
for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){
|
||||
if( b1[iii] != b2[iii] ) {
|
||||
return iii;
|
||||
}
|
||||
}
|
||||
return b1.length;
|
||||
}
|
||||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList<Haplotype> haplotypes, final String sample ) {
|
||||
// set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization?
|
||||
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
list.add(h);
|
||||
haplotypeMapping.add(list);
|
||||
}
|
||||
return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping );
|
||||
}
|
||||
|
||||
@Requires({"haplotypeMapping.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList<ArrayList<Haplotype>> haplotypeMapping ) {
|
||||
|
||||
final int numHaplotypes = haplotypeMapping.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// compute the diploid haplotype likelihoods
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
|
||||
final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
|
||||
for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
|
||||
final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF;
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// normalize the diploid likelihoods matrix
|
||||
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
|
||||
}
|
||||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"})
|
||||
public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList<Haplotype> haplotypes, final Set<String> samples ) {
|
||||
// set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization?
|
||||
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
|
||||
for( final Haplotype h : haplotypes ) {
|
||||
final ArrayList<Haplotype> list = new ArrayList<Haplotype>();
|
||||
list.add(h);
|
||||
haplotypeMapping.add(list);
|
||||
}
|
||||
|
||||
final int numHaplotypes = haplotypeMapping.size();
|
||||
final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes];
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY);
|
||||
}
|
||||
|
||||
// compute the diploid haplotype likelihoods
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) {
|
||||
for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) {
|
||||
double haplotypeLikelihood = 0.0;
|
||||
for( final String sample : samples ) {
|
||||
final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample);
|
||||
final double[] readLikelihoods_jjj = jjj_mapped.getReadLikelihoods(sample);
|
||||
for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) {
|
||||
// Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2)
|
||||
// First term is approximated by Jacobian log with table lookup.
|
||||
haplotypeLikelihood += MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF;
|
||||
}
|
||||
}
|
||||
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum?
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// normalize the diploid likelihoods matrix
|
||||
return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix );
|
||||
}
|
||||
|
||||
@Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"})
|
||||
@Ensures({"result.length == result[0].length", "result.length == likelihoodMatrix.length"})
|
||||
protected static double[][] normalizeDiploidLikelihoodMatrixFromLog10( final double[][] likelihoodMatrix ) {
|
||||
final int numHaplotypes = likelihoodMatrix.length;
|
||||
double[] genotypeLikelihoods = new double[numHaplotypes*(numHaplotypes+1)/2];
|
||||
int index = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ){
|
||||
genotypeLikelihoods[index++] = likelihoodMatrix[iii][jjj];
|
||||
}
|
||||
}
|
||||
genotypeLikelihoods = MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true);
|
||||
index = 0;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ){
|
||||
likelihoodMatrix[iii][jjj] = genotypeLikelihoods[index++];
|
||||
}
|
||||
}
|
||||
return likelihoodMatrix;
|
||||
}
|
||||
|
||||
/*
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.size() <= haplotypes.size()"})
|
||||
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes ) {
|
||||
|
||||
// BUGBUG: This function needs a lot of work. Need to use 4-gamete test or Tajima's D to decide to break up events into separate pieces for genotyping
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
|
||||
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
|
||||
bestHaplotypesIndexList.add(0); // always start with the reference haplotype
|
||||
final double[][][] haplotypeLikelihoodMatrix = new double[sampleKeySet.size()][numHaplotypes][numHaplotypes];
|
||||
|
||||
int sampleCount = 0;
|
||||
for( final String sample : sampleKeySet ) {
|
||||
haplotypeLikelihoodMatrix[sampleCount++] = computeDiploidHaplotypeLikelihoods( haplotypes, sample );
|
||||
}
|
||||
|
||||
int hap1 = 0;
|
||||
int hap2 = 0;
|
||||
int chosenSample = 0;
|
||||
//double bestElement = Double.NEGATIVE_INFINITY;
|
||||
final int maxChosenHaplotypes = Math.min( 15, sampleKeySet.size() * 2 + 1 );
|
||||
while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for( int kkk = 0; kkk < sampleCount; kkk++ ) {
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
if( haplotypeLikelihoodMatrix[kkk][iii][jjj] > maxElement ) {
|
||||
maxElement = haplotypeLikelihoodMatrix[kkk][iii][jjj];
|
||||
hap1 = iii;
|
||||
hap2 = jjj;
|
||||
chosenSample = kkk;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
|
||||
|
||||
if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
|
||||
if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
|
||||
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
haplotypeLikelihoodMatrix[chosenSample][iii][jjj] = Double.NEGATIVE_INFINITY;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
|
||||
|
||||
final ArrayList<Haplotype> bestHaplotypes = new ArrayList<Haplotype>();
|
||||
for( final int hIndex : bestHaplotypesIndexList ) {
|
||||
bestHaplotypes.add( haplotypes.get(hIndex) );
|
||||
}
|
||||
return bestHaplotypes;
|
||||
}
|
||||
*/
|
||||
|
||||
@Requires({"haplotypes.size() > 0"})
|
||||
@Ensures({"result.size() <= haplotypes.size()"})
|
||||
public ArrayList<Haplotype> selectBestHaplotypes( final ArrayList<Haplotype> haplotypes ) {
|
||||
|
||||
final int numHaplotypes = haplotypes.size();
|
||||
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
|
||||
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
|
||||
bestHaplotypesIndexList.add(0); // always start with the reference haplotype
|
||||
final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( haplotypes, sampleKeySet ); // all samples pooled together
|
||||
|
||||
int hap1 = 0;
|
||||
int hap2 = 0;
|
||||
//double bestElement = Double.NEGATIVE_INFINITY;
|
||||
final int maxChosenHaplotypes = Math.min( 8, sampleKeySet.size() * 2 + 1 );
|
||||
while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) {
|
||||
double maxElement = Double.NEGATIVE_INFINITY;
|
||||
for( int iii = 0; iii < numHaplotypes; iii++ ) {
|
||||
for( int jjj = 0; jjj <= iii; jjj++ ) {
|
||||
if( haplotypeLikelihoodMatrix[iii][jjj] > maxElement ) {
|
||||
maxElement = haplotypeLikelihoodMatrix[iii][jjj];
|
||||
hap1 = iii;
|
||||
hap2 = jjj;
|
||||
}
|
||||
}
|
||||
}
|
||||
if( maxElement == Double.NEGATIVE_INFINITY ) { break; }
|
||||
if( DEBUG ) { System.out.println("Chose haplotypes " + hap1 + " and " + hap2 + " with diploid likelihood = " + haplotypeLikelihoodMatrix[hap1][hap2]); }
|
||||
haplotypeLikelihoodMatrix[hap1][hap2] = Double.NEGATIVE_INFINITY;
|
||||
|
||||
if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); }
|
||||
if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); }
|
||||
}
|
||||
|
||||
if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); }
|
||||
|
||||
final ArrayList<Haplotype> bestHaplotypes = new ArrayList<Haplotype>();
|
||||
for( final int hIndex : bestHaplotypesIndexList ) {
|
||||
bestHaplotypes.add( haplotypes.get(hIndex) );
|
||||
}
|
||||
return bestHaplotypes;
|
||||
}
|
||||
|
||||
public static Map<String, Map<Allele, List<GATKSAMRecord>>> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList, final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) {
|
||||
final Map<String, Map<Allele, List<GATKSAMRecord>>> returnMap = new HashMap<String, Map<Allele, List<GATKSAMRecord>>>();
|
||||
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
|
||||
for( final String sample : perSampleReadList.keySet() ) {
|
||||
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>();
|
||||
final ArrayList<GATKSAMRecord> readsForThisSample = perSampleReadList.get(sample);
|
||||
for( int iii = 0; iii < readsForThisSample.size(); iii++ ) {
|
||||
final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same!
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
final double likelihoods[] = new double[call.getFirst().getAlleles().size()];
|
||||
int count = 0;
|
||||
for( final Allele a : call.getFirst().getAlleles() ) { // find the allele with the highest haplotype likelihood
|
||||
double maxLikelihood = Double.NEGATIVE_INFINITY;
|
||||
for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object)
|
||||
final double likelihood = h.getReadLikelihoods(sample)[iii];
|
||||
if( likelihood > maxLikelihood ) {
|
||||
maxLikelihood = likelihood;
|
||||
}
|
||||
}
|
||||
likelihoods[count++] = maxLikelihood;
|
||||
}
|
||||
final int bestAllele = MathUtils.maxElementIndex(likelihoods);
|
||||
final double bestLikelihood = likelihoods[bestAllele];
|
||||
Allele allele = Allele.NO_CALL;
|
||||
boolean isInformativeRead = false;
|
||||
for( final double likelihood : likelihoods ) {
|
||||
if( bestLikelihood - likelihood > BEST_LIKELIHOOD_THRESHOLD ) {
|
||||
isInformativeRead = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// uninformative reads get the no call Allele
|
||||
if( isInformativeRead ) {
|
||||
allele = call.getFirst().getAlleles().get(bestAllele);
|
||||
}
|
||||
List<GATKSAMRecord> readList = alleleReadMap.get(allele);
|
||||
if( readList == null ) {
|
||||
readList = new ArrayList<GATKSAMRecord>();
|
||||
alleleReadMap.put(allele, readList);
|
||||
}
|
||||
readList.add(read);
|
||||
}
|
||||
}
|
||||
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods
|
||||
List<GATKSAMRecord> readList = alleleReadMap.get(Allele.NO_CALL);
|
||||
if( readList == null ) {
|
||||
readList = new ArrayList<GATKSAMRecord>();
|
||||
alleleReadMap.put(Allele.NO_CALL, readList);
|
||||
}
|
||||
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample) ) {
|
||||
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all
|
||||
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
|
||||
readList.add(read);
|
||||
}
|
||||
}
|
||||
returnMap.put(sample, alleleReadMap);
|
||||
}
|
||||
return returnMap;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,25 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks
|
||||
* Date: Mar 14, 2011
|
||||
*/
|
||||
public abstract class LocalAssemblyEngine {
|
||||
|
||||
public enum ASSEMBLER {
|
||||
SIMPLE_DE_BRUIJN
|
||||
}
|
||||
|
||||
protected LocalAssemblyEngine() {
|
||||
}
|
||||
|
||||
public abstract ArrayList<Haplotype> runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, ArrayList<VariantContext> activeAllelesToGenotype);
|
||||
}
|
||||
|
|
@ -0,0 +1,372 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import com.google.java.contract.Ensures;
|
||||
import org.apache.commons.lang.ArrayUtils;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.SWPairwiseAlignment;
|
||||
import org.broadinstitute.sting.utils.activeregion.ActiveRegion;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.sam.AlignmentUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: ebanks, rpoplin
|
||||
* Date: Mar 14, 2011
|
||||
*/
|
||||
|
||||
public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
|
||||
|
||||
private static final int KMER_OVERLAP = 5; // the additional size of a valid chunk of sequence, used to string together k-mers
private static final int NUM_BEST_PATHS_PER_KMER_GRAPH = 11;
private static final byte MIN_QUALITY = (byte) 17;

// Smith-Waterman parameters originally copied from IndelRealigner
private static final double SW_MATCH = 5.0;       // 1.0;
private static final double SW_MISMATCH = -10.0;  //-1.0/3.0;
private static final double SW_GAP = -22.0;       //-1.0-1.0/3.0;
private static final double SW_GAP_EXTEND = -1.2; //-1.0/.0;

private final boolean DEBUG;
private final PrintStream GRAPH_WRITER;
// one de Bruijn graph per k-mer size tried during assembly
private final ArrayList<DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>> graphs = new ArrayList<DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>>();

private int PRUNE_FACTOR = 1;

/**
 * @param debug       when true, progress information is printed to stdout
 * @param graphWriter destination for graph dumps, or null to disable graph output
 */
public SimpleDeBruijnAssembler( final boolean debug, final PrintStream graphWriter ) {
    super();
    DEBUG = debug;
    GRAPH_WRITER = graphWriter;
}
|
||||
|
||||
public ArrayList<Haplotype> runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final ArrayList<VariantContext> activeAllelesToGenotype ) {
|
||||
this.PRUNE_FACTOR = PRUNE_FACTOR;
|
||||
|
||||
// create the graphs
|
||||
createDeBruijnGraphs( activeRegion.getReads(), refHaplotype );
|
||||
|
||||
// clean up the graphs by pruning and merging
|
||||
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
|
||||
pruneGraph( graph, PRUNE_FACTOR );
|
||||
//eliminateNonRefPaths( graph );
|
||||
mergeNodes( graph );
|
||||
}
|
||||
|
||||
if( GRAPH_WRITER != null ) {
|
||||
printGraphs();
|
||||
}
|
||||
|
||||
// find the best paths in the graphs
|
||||
return findBestPaths( refHaplotype, fullReferenceWithPadding, refLoc, activeAllelesToGenotype, activeRegion.getExtendedLoc() );
|
||||
}
|
||||
|
||||
private void createDeBruijnGraphs( final ArrayList<GATKSAMRecord> reads, final Haplotype refHaplotype ) {
|
||||
graphs.clear();
|
||||
|
||||
// create the graph
|
||||
for( int kmer = 31; kmer <= 75; kmer += 6 ) {
|
||||
final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
if( createGraphFromSequences( graph, reads, kmer, refHaplotype, DEBUG ) ) {
|
||||
graphs.add(graph);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected static void mergeNodes( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph ) {
|
||||
boolean foundNodesToMerge = true;
|
||||
while( foundNodesToMerge ) {
|
||||
foundNodesToMerge = false;
|
||||
for( final DeBruijnEdge e : graph.edgeSet() ) {
|
||||
final DeBruijnVertex outgoingVertex = graph.getEdgeTarget(e);
|
||||
final DeBruijnVertex incomingVertex = graph.getEdgeSource(e);
|
||||
if( !outgoingVertex.equals(incomingVertex) && graph.inDegreeOf(outgoingVertex) == 1 && graph.outDegreeOf(incomingVertex) == 1) {
|
||||
final Set<DeBruijnEdge> outEdges = graph.outgoingEdgesOf(outgoingVertex);
|
||||
final Set<DeBruijnEdge> inEdges = graph.incomingEdgesOf(incomingVertex);
|
||||
if( inEdges.size() == 1 && outEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() / 2 ) );
|
||||
} else if( inEdges.size() == 1 ) {
|
||||
inEdges.iterator().next().setMultiplicity( inEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
} else if( outEdges.size() == 1 ) {
|
||||
outEdges.iterator().next().setMultiplicity( outEdges.iterator().next().getMultiplicity() + ( e.getMultiplicity() - 1 ) );
|
||||
}
|
||||
|
||||
final DeBruijnVertex addedVertex = new DeBruijnVertex( ArrayUtils.addAll(incomingVertex.getSequence(), outgoingVertex.getSuffix()), outgoingVertex.kmer );
|
||||
graph.addVertex(addedVertex);
|
||||
for( final DeBruijnEdge edge : outEdges ) {
|
||||
graph.addEdge(addedVertex, graph.getEdgeTarget(edge), new DeBruijnEdge(edge.getIsRef(), edge.getMultiplicity()));
|
||||
}
|
||||
for( final DeBruijnEdge edge : inEdges ) {
|
||||
graph.addEdge(graph.getEdgeSource(edge), addedVertex, new DeBruijnEdge(edge.getIsRef(), edge.getMultiplicity()));
|
||||
}
|
||||
|
||||
graph.removeVertex( incomingVertex );
|
||||
graph.removeVertex( outgoingVertex );
|
||||
foundNodesToMerge = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected static void pruneGraph( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final int pruneFactor ) {
|
||||
final ArrayList<DeBruijnEdge> edgesToRemove = new ArrayList<DeBruijnEdge>();
|
||||
for( final DeBruijnEdge e : graph.edgeSet() ) {
|
||||
if( e.getMultiplicity() <= pruneFactor && !e.getIsRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor
|
||||
edgesToRemove.add(e);
|
||||
}
|
||||
}
|
||||
graph.removeAllEdges(edgesToRemove);
|
||||
|
||||
// Run through the graph and clean up singular orphaned nodes
|
||||
final ArrayList<DeBruijnVertex> verticesToRemove = new ArrayList<DeBruijnVertex>();
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) {
|
||||
verticesToRemove.add(v);
|
||||
}
|
||||
}
|
||||
graph.removeAllVertices(verticesToRemove);
|
||||
}
|
||||
|
||||
/**
 * Repeatedly removes graph sources and sinks (in-degree 0 or out-degree 0) that are not
 * touched by any reference edge, iterating to a fixpoint. Each removal can expose new
 * non-reference sources/sinks, hence the outer while loop. Runs in place on {@code graph}.
 */
protected static void eliminateNonRefPaths( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph ) {
    final ArrayList<DeBruijnVertex> verticesToRemove = new ArrayList<DeBruijnVertex>();
    boolean done = false;
    while( !done ) {
        done = true;
        for( final DeBruijnVertex v : graph.vertexSet() ) {
            // Only endpoints of paths (no predecessors or no successors) are candidates.
            if( graph.inDegreeOf(v) == 0 || graph.outDegreeOf(v) == 0 ) {
                // Keep the vertex if any incident edge belongs to the reference path.
                boolean isRefNode = false;
                for( final DeBruijnEdge e : graph.edgesOf(v) ) {
                    if( e.getIsRef() ) {
                        isRefNode = true;
                        break;
                    }
                }
                if( !isRefNode ) {
                    // Found something to delete; another sweep will be needed because
                    // removing it may create new non-reference sources/sinks.
                    done = false;
                    verticesToRemove.add(v);
                }
            }
        }
        // Deletions are deferred to here so we never mutate vertexSet() mid-iteration.
        graph.removeAllVertices(verticesToRemove);
        verticesToRemove.clear();
    }
}
|
||||
|
||||
/**
 * Populates the de Bruijn graph from the reference haplotype and then from the reads.
 * Reference kmer pairs are added first (marked as reference edges); if adding any of them
 * reports a cycle, graph construction for this kmer size is abandoned.
 *
 * @param graph        the graph to fill in place
 * @param reads        reads whose kmers are added as non-reference evidence
 * @param KMER_LENGTH  kmer size for this graph
 * @param refHaplotype source of the reference base sequence
 * @param DEBUG        if true, print a message when a reference cycle is detected
 * @return false if the reference sequence produced a cycle at this kmer size, true otherwise
 */
private static boolean createGraphFromSequences( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final ArrayList<GATKSAMRecord> reads, final int KMER_LENGTH, final Haplotype refHaplotype, final boolean DEBUG ) {
    final byte[] refSequence = refHaplotype.getBases();
    // NOTE(review): the reference uses >= here while the read loop below uses a strict > for
    // the same length guard — confirm whether the asymmetry is intentional.
    if( refSequence.length >= KMER_LENGTH + KMER_OVERLAP ) {
        final int kmersInSequence = refSequence.length - KMER_LENGTH + 1;
        for (int i = 0; i < kmersInSequence - 1; i++) {
            // get the adjacent kmer pair starting at i and i+1
            final byte[] kmer1 = new byte[KMER_LENGTH];
            System.arraycopy(refSequence, i, kmer1, 0, KMER_LENGTH);
            final byte[] kmer2 = new byte[KMER_LENGTH];
            System.arraycopy(refSequence, i+1, kmer2, 0, KMER_LENGTH);
            // isRef=true: a repeated reference kmer means a cycle, which addKmersToGraph signals
            if( !addKmersToGraph(graph, kmer1, kmer2, true) ) {
                if( DEBUG ) {
                    System.out.println("Cycle detected in reference graph for kmer = " + KMER_LENGTH + " ...skipping");
                }
                return false;
            }
        }
    }

    for( final GATKSAMRecord read : reads ) {
        final byte[] sequence = read.getReadBases();
        final byte[] qualities = read.getBaseQualities();
        if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) {
            final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
            for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
                // only use the kmer pair if every base covered by both kmers
                // (KMER_LENGTH + 1 positions starting at iii) is high quality
                boolean badKmer = false;
                for( int jjj = iii; jjj < iii + KMER_LENGTH + 1; jjj++) {
                    if( qualities[jjj] < MIN_QUALITY ) {
                        badKmer = true;
                        break;
                    }
                }
                if( !badKmer ) {
                    // get the adjacent kmer pair starting at iii and iii+1
                    final byte[] kmer1 = new byte[KMER_LENGTH];
                    System.arraycopy(sequence, iii, kmer1, 0, KMER_LENGTH);
                    final byte[] kmer2 = new byte[KMER_LENGTH];
                    System.arraycopy(sequence, iii+1, kmer2, 0, KMER_LENGTH);

                    // read kmers are non-reference evidence; cycle result is ignored here
                    addKmersToGraph(graph, kmer1, kmer2, false);
                }
            }
        }
    }
    return true;
}
|
||||
|
||||
protected static boolean addKmersToGraph( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph, final byte[] kmer1, final byte[] kmer2, final boolean isRef ) {
|
||||
|
||||
final int numVertexBefore = graph.vertexSet().size();
|
||||
final DeBruijnVertex v1 = new DeBruijnVertex( kmer1, kmer1.length );
|
||||
graph.addVertex(v1);
|
||||
final DeBruijnVertex v2 = new DeBruijnVertex( kmer2, kmer2.length );
|
||||
graph.addVertex(v2);
|
||||
if( isRef && graph.vertexSet().size() == numVertexBefore ) { return false; }
|
||||
|
||||
final DeBruijnEdge targetEdge = graph.getEdge(v1, v2);
|
||||
if ( targetEdge == null ) {
|
||||
graph.addEdge(v1, v2, new DeBruijnEdge( isRef ));
|
||||
} else {
|
||||
if( isRef ) {
|
||||
targetEdge.setIsRef( true );
|
||||
}
|
||||
targetEdge.setMultiplicity(targetEdge.getMultiplicity() + 1);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private void printGraphs() {
|
||||
int count = 0;
|
||||
for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
|
||||
GRAPH_WRITER.println("digraph kmer" + count++ +" {");
|
||||
for( final DeBruijnEdge edge : graph.edgeSet() ) {
|
||||
if( edge.getMultiplicity() > PRUNE_FACTOR ) {
|
||||
GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [" + (edge.getMultiplicity() <= PRUNE_FACTOR ? "style=dotted,color=grey" : "label=\""+ edge.getMultiplicity() +"\"") + "];");
|
||||
}
|
||||
if( edge.getIsRef() ) {
|
||||
GRAPH_WRITER.println("\t" + graph.getEdgeSource(edge).toString() + " -> " + graph.getEdgeTarget(edge).toString() + " [color=red];");
|
||||
}
|
||||
if( !edge.getIsRef() && edge.getMultiplicity() <= PRUNE_FACTOR ) { System.out.println("Graph pruning warning!"); }
|
||||
}
|
||||
for( final DeBruijnVertex v : graph.vertexSet() ) {
|
||||
final String label = ( graph.inDegreeOf(v) == 0 ? v.toString() : v.getSuffixString() );
|
||||
GRAPH_WRITER.println("\t" + v.toString() + " [label=\"" + label + "\"]");
|
||||
}
|
||||
GRAPH_WRITER.println("}");
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Collects candidate haplotypes for evaluation: the reference haplotype first, then
 * (in GGA mode) reference-plus-requested-allele haplotypes, then the K best paths
 * through each assembly graph, each optionally augmented with requested GGA alleles
 * that the path itself did not already contain.
 *
 * @return the list of accepted haplotypes; always contains {@code refHaplotype}
 */
@Ensures({"result.contains(refHaplotype)"})
private ArrayList<Haplotype> findBestPaths( final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final ArrayList<VariantContext> activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) {
    final ArrayList<Haplotype> returnHaplotypes = new ArrayList<Haplotype>();

    // add the reference haplotype separately from all the others, aligned against the padded reference
    final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( fullReferenceWithPadding, refHaplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
    refHaplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
    refHaplotype.setCigar( swConsensus.getCigar() );
    if( !returnHaplotypes.add( refHaplotype ) ) {
        throw new ReviewedStingException("Unable to add reference haplotype during assembly: " + refHaplotype);
    }

    // active region bounds expressed in padded-reference coordinates, derived from the ref alignment
    final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
    final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength();

    for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype
        for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
            final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart());
            if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
                // bail out with whatever was accepted so far rather than failing hard
                return returnHaplotypes;
                //throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype);
            }
        }
    }

    for( final DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge> graph : graphs ) {
        for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) {
            final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() );
            if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) {
                if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
                    // discover which variants this haplotype already carries
                    final HashMap<Integer,VariantContext> eventMap = GenotypingEngine.generateVCsFromAlignment( h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly", 0 ); // BUGBUG: need to put this function in a shared place
                    for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present
                        final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart());
                        // only inject the requested allele if the path doesn't already contain it
                        if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) {
                            for( final Allele compAltAllele : compVC.getAlternateAlleles() ) {
                                addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop );
                            }
                        }
                    }
                }
            }
        }
    }

    if( DEBUG ) {
        if( returnHaplotypes.size() > 1 ) {
            System.out.println("Found " + returnHaplotypes.size() + " candidate haplotypes to evaluate every read against.");
        } else {
            System.out.println("Found only the reference haplotype in the assembly graph.");
        }
        for( final Haplotype h : returnHaplotypes ) {
            System.out.println( h.toString() );
            System.out.println( "> Cigar = " + h.getCigar() );
        }
    }

    return returnHaplotypes;
}
|
||||
|
||||
/**
 * Aligns a candidate haplotype against the padded reference, trims or pads it so that it
 * spans exactly the active region, realigns the result, and appends it to
 * {@code haplotypeList} if it passes the sanity checks and is not already present.
 *
 * @param haplotype         candidate haplotype (its alignment fields are mutated here)
 * @param ref               padded reference bases the haplotype is aligned against
 * @param haplotypeList     accumulator list; modified in place on success
 * @param activeRegionStart active region start in padded-reference coordinates
 * @param activeRegionStop  active region stop in padded-reference coordinates
 * @return true if the haplotype was added, false if rejected (SW failure, wrong span, or duplicate)
 */
private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList<Haplotype> haplotypeList, final int activeRegionStart, final int activeRegionStop ) {
    //final int sizeOfActiveRegion = activeRegionStop - activeRegionStart;
    final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );
    haplotype.setAlignmentStartHapwrtRef( swConsensus.getAlignmentStart2wrt1() );
    haplotype.setCigar( AlignmentUtils.leftAlignIndel(swConsensus.getCigar(), ref, haplotype.getBases(), swConsensus.getAlignmentStart2wrt1(), 0) );

    // NOTE(review): 60 is a magic minimum reference span — presumably chosen to reject
    // degenerate alignments; confirm against the active-region sizing elsewhere.
    if( swConsensus.getCigar().toString().contains("S") || swConsensus.getCigar().getReferenceLength() < 60 ) { // protect against SW failures
        return false;
    }

    // map the active region boundaries into haplotype (read) coordinates
    final int hapStart = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStart, ReadUtils.ClippingTail.LEFT_TAIL, true );
    int hapStop = ReadUtils.getReadCoordinateForReferenceCoordinate( haplotype.getAlignmentStartHapwrtRef(), haplotype.getCigar(), activeRegionStop, ReadUtils.ClippingTail.RIGHT_TAIL, true );
    if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED && activeRegionStop == haplotype.getAlignmentStartHapwrtRef() + haplotype.getCigar().getReferenceLength() ) {
        hapStop = activeRegionStop; // contract for getReadCoordinateForReferenceCoordinate function says that if read ends at boundary then it is outside of the clipping goal
    }
    byte[] newHaplotypeBases;
    // extend partial haplotypes to contain the full active region sequence
    int leftBreakPoint = 0;
    int rightBreakPoint = 0;
    if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED && hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
        // haplotype covers neither boundary: pad with reference bases on both sides
        newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()),
                                                                  haplotype.getBases()),
                                               ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
        leftBreakPoint = swConsensus.getAlignmentStart2wrt1() - activeRegionStart;
        rightBreakPoint = leftBreakPoint + haplotype.getBases().length;
        //newHaplotypeBases = haplotype.getBases();
        //return false; // piece of haplotype isn't anchored within the active region so don't build a haplotype out of it
    } else if( hapStart == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
        // left boundary not reached: pad the left side with reference, keep the haplotype up to hapStop
        //return false;
        newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(ref, activeRegionStart, swConsensus.getAlignmentStart2wrt1()), ArrayUtils.subarray(haplotype.getBases(), 0, hapStop) );
        //newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), 0, hapStop);
        leftBreakPoint = swConsensus.getAlignmentStart2wrt1() - activeRegionStart;
    } else if( hapStop == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) {
        // right boundary not reached: keep the haplotype from hapStart on, pad the right side with reference
        //return false;
        newHaplotypeBases = ArrayUtils.addAll( ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length), ArrayUtils.subarray(ref, swConsensus.getAlignmentStart2wrt1() + swConsensus.getCigar().getReferenceLength(), activeRegionStop) );
        //newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, haplotype.getBases().length);
        rightBreakPoint = haplotype.getBases().length - hapStart;
    } else {
        // haplotype spans the whole active region: just trim it to the region
        newHaplotypeBases = ArrayUtils.subarray(haplotype.getBases(), hapStart, hapStop);
    }

    // realign the trimmed/extended haplotype so its cigar reflects the final bases
    final Haplotype h = new Haplotype( newHaplotypeBases );
    final SWPairwiseAlignment swConsensus2 = new SWPairwiseAlignment( ref, h.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND );

    h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() );
    h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) );
    h.leftBreakPoint = leftBreakPoint;
    h.rightBreakPoint = rightBreakPoint;
    // final haplotype must cover exactly the active region
    if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures
        return false;
    }

    // reject exact duplicates of already-accepted haplotypes
    if( !haplotypeList.contains(h) ) {
        haplotypeList.add(h);
        return true;
    } else {
        return false;
    }
}
|
||||
}
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
/**
|
||||
* Provides map/reduce application framework highly optimized for analysis of sequencing data.
|
||||
* @version 1.0
|
||||
*/
|
||||
package org.broadinstitute.sting;
|
||||
|
|
@ -0,0 +1,125 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.bqsr;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* @author ebanks
|
||||
* @since 7/16/12
|
||||
*/
|
||||
/**
 * Integration tests for base quality score recalibration: BaseRecalibrator runs over
 * several platforms/arguments with md5-pinned outputs, a failure case without known
 * sites, and PrintReads -BQSR application of a recalibration table.
 */
public class BQSRIntegrationTest extends WalkerTest {

    // Bundles one BaseRecalibrator invocation: inputs, extra arguments, expected output md5.
    private static class BQSRTest {
        final String reference;
        final String interval;
        final String bam;
        final String args;
        final String md5;

        private BQSRTest(String reference, String bam, String interval, String args, String md5) {
            this.reference = reference;
            this.bam = bam;
            this.interval = interval;
            this.args = args;
            this.md5 = md5;
        }

        @Override
        public String toString() {
            return String.format("BQSR(bam='%s', args='%s')", bam, args);
        }
    }

    // Covers default covariates, single-covariate runs, parameter overrides, multiple
    // sequencing platforms (Illumina, SOLiD, 454), original quals, and known-sites inputs.
    @DataProvider(name = "BQSRTest")
    public Object[][] createBQSRTestData() {
        String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
        String HiSeqInterval = "chr1:10,000,000-10,100,000";
        return new Object[][]{
                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "087dbc3e3f0cee6b891aecad2d0faf25")},
                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "8a69591f728b3a2cdd79ff26bbebcc26")},
                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "73d649bce0b69f56452de8c7e0a8686d")},
                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "d9512cebf54ea120539059976b33d1ca")},
                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "f61a8df03aae8c4273acf2b72497f154")},
                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "7c2ce84e521d8f19fe5061b4e40317f7")},
                {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "66a0caad65ab41d9013e812617e67370")},
                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "6f5e9836147b488a7a204cffa5ecd21e")},
                {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "444fdfca7835e6a3714445f7e60abcaf")},
                {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "e0bfaf38f45142d45c8fe0ae05d0d4e0")},
                {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "5b30760bab51b4d1fc02097d4eacefa4")},
                {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "742fd8edfa36ab9023ceeaac40c4e215")},
                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "6f5e9836147b488a7a204cffa5ecd21e")},
                {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "22dd42897cf20852712c6e8f63195443")},
        };
    }

    // Runs BaseRecalibrator for each data row; the default dbSNP for the reference build
    // is supplied as -knownSites, and the recalibration table is md5-checked.
    @Test(dataProvider = "BQSRTest")
    public void testBQSR(BQSRTest params) {
        WalkerTestSpec spec = new WalkerTestSpec(
                " -T BaseRecalibrator" +
                        " -R " + params.reference +
                        " -I " + params.bam +
                        " -L " + params.interval +
                        params.args +
                        " --no_plots" +
                        " -knownSites " + (params.reference.equals(b36KGReference) ? b36dbSNP129 : hg18dbSNP132) +
                        " -o %s",
                Arrays.asList(params.md5));
        executeTest("testBQSR-"+params.args, spec).getFirst();
    }

    // BaseRecalibrator without any -knownSites must fail with a command-line exception.
    @Test
    public void testBQSRFailWithoutDBSNP() {
        WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
                " -T BaseRecalibrator" +
                        " -R " + b36KGReference +
                        " -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" +
                        " -L 1:10,000,000-10,200,000" +
                        " --no_plots" +
                        " -o %s",
                1, // just one output file
                UserException.CommandLineException.class);
        executeTest("testBQSRFailWithoutDBSNP", spec);
    }

    // Bundles one PrintReads -BQSR invocation: extra arguments plus expected output md5.
    private static class PRTest {
        final String args;
        final String md5;

        private PRTest(String args, String md5) {
            this.args = args;
            this.md5 = md5;
        }

        @Override
        public String toString() {
            return String.format("PrintReads(args='%s')", args);
        }
    }

    // Default application plus quality-quantization (-qq) and original-qual (-DIQ) variants.
    @DataProvider(name = "PRTest")
    public Object[][] createPRTestData() {
        return new Object[][]{
                {new PRTest("", "d2d6ed8667cdba7e56f5db97d6262676")},
                {new PRTest(" -qq -1", "b7053d3d67aba6d8892f0a60f0ded338")},
                {new PRTest(" -qq 6", "bfbf0855185b2b70aa35237fb71e4487")},
                {new PRTest(" -DIQ", "66aa65223f192ee39c1773aa187fd493")}
        };
    }

    // Applies an existing recalibration table with PrintReads and md5-checks the output bam.
    @Test(dataProvider = "PRTest")
    public void testPR(PRTest params) {
        WalkerTestSpec spec = new WalkerTestSpec(
                "-T PrintReads" +
                        " -R " + hg18Reference +
                        " -I " + privateTestDir + "HiSeq.1mb.1RG.bam" +
                        " -BQSR " + privateTestDir + "HiSeq.1mb.1RG.table" +
                        params.args +
                        " -o %s",
                Arrays.asList(params.md5));
        executeTest("testPrintReads-"+params.args, spec).getFirst();
    }
}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Basic unit test for BaseCounts in reduced reads
|
||||
*/
|
||||
/**
 * Basic unit test for BaseCounts in reduced reads: verifies total counts, the most
 * common base, and its count across a range of base strings (including N, deletions
 * 'D', insertions 'I', and the empty string).
 */
public class BaseCountsUnitTest extends BaseTest {
    // One test case: the bases to count, the expected winner, and its expected count.
    private class SingleTest {
        public String bases;
        public byte mostCountBase;
        public int mostCommonCount;

        private SingleTest(String bases, char mostCountBase, int mostCommonCount) {
            this.mostCommonCount = mostCommonCount;
            this.mostCountBase = (byte)mostCountBase;
            this.bases = bases;
        }
    }


    @DataProvider(name = "data")
    public Object[][] createData1() {
        List<SingleTest> params = new ArrayList<SingleTest>();

        params.add(new SingleTest("A", 'A', 1 ));
        params.add(new SingleTest("AA", 'A', 2 ));
        params.add(new SingleTest("AC", 'A', 1 ));
        params.add(new SingleTest("AAC", 'A', 2 ));
        params.add(new SingleTest("AAA", 'A', 3 ));
        params.add(new SingleTest("AAAN", 'A', 3 ));
        params.add(new SingleTest("AAANNNN", 'N', 4 ));
        params.add(new SingleTest("AACTG", 'A', 2 ));
        params.add(new SingleTest("D", 'D', 1 ));
        params.add(new SingleTest("DDAAD", 'D', 3));
        // empty input: winner is the designated "no counts" sentinel base with count 0
        params.add(new SingleTest("", (char)BaseCounts.MAX_BASE_WITH_NO_COUNTS, 0 ));
        params.add(new SingleTest("AAIIIAI", 'I', 4 ));

        List<Object[]> params2 = new ArrayList<Object[]>();
        for ( SingleTest x : params ) params2.add(new Object[]{x});
        return params2.toArray(new Object[][]{});
    }



    // Feeds every base into a fresh BaseCounts and checks the three accessors.
    @Test(dataProvider = "data", enabled = true)
    public void testCounting(SingleTest params) {
        BaseCounts counts = new BaseCounts();

        for ( byte base : params.bases.getBytes() )
            counts.incr(base);

        String name = String.format("Test-%s", params.bases);
        Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
        Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name);
        Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name);
    }
}
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
 * Integration tests for the ReduceReads walker: runs several compression settings and
 * interval layouts over fixed bams and md5-checks the reduced output.
 */
public class ReduceReadsIntegrationTest extends WalkerTest {
    final static String REF = b37KGReference;
    final String BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
    final String DELETION_BAM = validationDataLocation + "filtered_deletion_for_reduce_reads.bam";
    final String STASH_BAM = validationDataLocation + "ReduceReadsStashBug.bam";
    // intervals reproducing the stash ordering bug (see testAddingReadAfterTailingTheStash)
    final String STASH_L = " -L 14:73718184-73718284 -L 14:73718294-73718330 -L 14:73718360-73718556";
    final String L = " -L 20:10,100,000-10,120,000 ";

    // Shared driver: runs ReduceReads on the default bam with extra args, md5-checks the output.
    private void RRTest(String testName, String args, String md5) {
        String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, BAM) + " -o %s ";
        WalkerTestSpec spec = new WalkerTestSpec(base + args, Arrays.asList(md5));
        executeTest(testName, spec);
    }

    @Test(enabled = true)
    public void testDefaultCompression() {
        RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f");
    }

    @Test(enabled = true)
    public void testMultipleIntervals() {
        String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
        RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76");
    }

    @Test(enabled = true)
    public void testHighCompression() {
        RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047");
    }

    @Test(enabled = true)
    public void testLowCompression() {
        RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "afd39459c841b68a442abdd5ef5f8f27");
    }

    @Test(enabled = true)
    public void testIndelCompression() {
        RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973");
    }

    // Uses its own bam (not the shared driver) to exercise filtered-deletion handling.
    @Test(enabled = true)
    public void testFilteredDeletionCompression() {
        String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
        executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb")));
    }

    /**
     * Bug reported by Adam where a read that got clipped before actually belongs 2 intervals ahead
     * and a subsequent tail leaves only this read in the stash. The next read to come in is in fact
     * before (alignment start) than this read, so the TreeSet breaks with a Key out of Range error
     * that was freaking hard to catch.
     *
     * This bam is simplified to replicate the exact bug with the three provided intervals.
     */
    @Test(enabled = true)
    public void testAddingReadAfterTailingTheStash() {
        String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
        executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6")));
    }
}
|
||||
|
||||
|
|
@ -0,0 +1,85 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
|
||||
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Random;
|
||||
|
||||
/**
 * Unit test for SyntheticRead: checks that convertBaseCounts() maps the per-base count
 * sequence into the expected byte encoding (including values crossing the signed-byte
 * boundary, as the expected arrays below exercise).
 */
public class SyntheticReadUnitTest extends BaseTest {
    // Minimal artificial SAM context needed to construct a SyntheticRead.
    final SAMFileHeader artificialSAMHeader = ArtificialSAMUtils.createArtificialSamHeader(1, 1, 1);
    final GATKSAMReadGroupRecord artificialGATKRG = new GATKSAMReadGroupRecord("synthetic");
    final String artificialContig = "1";
    final int artificialContigIndex = 0;
    final String artificialReadName = "synth";
    final int artificialRefStart = 1;
    final double artificialMappingQuality = 60;

    // NOTE(review): not referenced by the visible tests — possibly left over; confirm before removing.
    final Random random = new Random(8854875);


    @Test
    public void testBaseCounts() {
        // All test reads share the same 4xA bases and flat Q20 qualities; only counts vary.
        BaseIndex [] bases = new BaseIndex[] {BaseIndex.A,BaseIndex.A,BaseIndex.A,BaseIndex.A};
        Byte[] quals = new Byte[] {20, 20, 20, 20 };

        TestRead [] testReads = new TestRead [] {
            new TestRead(bases, quals, new Byte[] {100, 100, 100, 101}, new byte [] {100, 0, 0, 1}),
            new TestRead(bases, quals, new Byte[] {1, 100, 100, 0}, new byte [] {1, 99, 99, -1}),
            new TestRead(bases, quals, new Byte[] {127, 100, 0, 1}, new byte [] {127, -27, -127, -126}),
            new TestRead(bases, quals, new Byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})};

        for (TestRead testRead : testReads) {
            SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false);
            Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts());
        }
    }

    // One fixture: input bases/quals/counts plus the byte[] that convertBaseCounts() should produce.
    // Insertion and deletion qualities reuse the base qualities for simplicity.
    private class TestRead {
        BaseIndex[] bases;
        Byte[] quals;
        Byte[] insQuals;
        Byte[] delQuals;
        Byte[] counts;
        byte [] expectedCounts;

        private TestRead(BaseIndex[] bases, Byte[] quals, Byte[] counts, byte[] expectedCounts) {
            this.bases = bases;
            this.quals = quals;
            this.insQuals = quals;
            this.delQuals = quals;
            this.counts = counts;
            this.expectedCounts = expectedCounts;
        }

        public BaseIndex[] getBases() {
            return bases;
        }

        public Byte[] getQuals() {
            return quals;
        }

        public Byte[] getInsQuals() {
            return insQuals;
        }

        public Byte[] getDelQuals() {
            return delQuals;
        }

        public Byte[] getCounts() {
            return counts;
        }

        public byte[] getExpectedCounts() {
            return expectedCounts;
        }
    }

}
|
||||
|
|
@ -0,0 +1,156 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Genotype;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypesContext;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeSuite;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 3/28/12
|
||||
* Time: 7:44 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PoolAFCalculationModelUnitTest extends BaseTest {
|
||||
|
||||
static double[] AA1, AB1, BB1;
|
||||
static double[] AA2, AB2, AC2, BB2, BC2, CC2;
|
||||
static double[] A4_1, B4_1, C4_1, D4_1, E4_1,F4_1;
|
||||
static double[] A4_400, B4_310, C4_220, D4_130, E4_121, F4_013;
|
||||
static final int numSamples = 4;
|
||||
static final int samplePloidy = 4; // = 2*samplesPerPool
|
||||
|
||||
@BeforeSuite
|
||||
public void before() {
|
||||
// legacy diploid cases
|
||||
AA1 = new double[]{-5.0, -20.0, -20.0};
|
||||
AB1 = new double[]{-20.0, 0.0, -20.0};
|
||||
BB1 = new double[]{-20.0, -20.0, 0.0};
|
||||
|
||||
// diploid, nAlleles = 3. Ordering is [2 0 0] [1 1 0] [0 2 0] [1 0 1] [0 1 1] [0 0 2], ie AA AB BB AC BC CC
|
||||
AA2 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
||||
AB2 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0};
|
||||
AC2 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0};
|
||||
BB2 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0};
|
||||
BC2 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
|
||||
CC2 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, 0.0};
|
||||
|
||||
// pool (i.e. polyploid cases)
|
||||
// NAlleles = 2, ploidy=4
|
||||
// ordering is [4 0] [3 1] [2 2 ] [1 3] [0 4]
|
||||
|
||||
A4_1 = new double[]{-3.0, -20.0, -20.0, -20.0, -20.0};
|
||||
B4_1 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0};
|
||||
C4_1 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0};
|
||||
D4_1 = new double[]{-20.0, -20.0, 0.0, 0.0, -20.0};
|
||||
E4_1 = new double[]{-20.0, -20.0, 0.0, 0.0, -20.0};
|
||||
F4_1 = new double[]{-20.0, -20.0, -20.0, -20.0, 0.0};
|
||||
|
||||
// NAlleles = 3, ploidy = 4
|
||||
// ordering is [4 0 0] [3 1 0] [2 2 0] [1 3 0] [0 4 0] [3 0 1] [2 1 1] [1 2 1] [0 3 1] [2 0 2] [1 1 2] [0 2 2] [1 0 3] [0 1 3] [0 0 4]
|
||||
A4_400 = new double[]{0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
||||
B4_310 = new double[]{-20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
||||
C4_220 = new double[]{-20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
||||
D4_130 = new double[]{-20.0, -20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
||||
E4_121 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, 0.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0};
|
||||
F4_013 = new double[]{-20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, 0.0, -20.0};
|
||||
|
||||
}
|
||||
|
||||
private class GetGLsTest extends TestDataProvider {
|
||||
GenotypesContext GLs;
|
||||
int numAltAlleles;
|
||||
String name;
|
||||
int ploidy;
|
||||
private GetGLsTest(String name, int numAltAlleles, int ploidy, Genotype... arg) {
|
||||
super(GetGLsTest.class, name);
|
||||
GLs = GenotypesContext.create(arg);
|
||||
this.name = name;
|
||||
this.numAltAlleles = numAltAlleles;
|
||||
this.ploidy = ploidy;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s input=%s", super.toString(), GLs);
|
||||
}
|
||||
}
|
||||
|
||||
private static Genotype createGenotype(String name, double[] gls, int ploidy) {
|
||||
Allele[] alleles = new Allele[ploidy];
|
||||
|
||||
for (int i=0; i < ploidy; i++)
|
||||
alleles[i] = Allele.NO_CALL;
|
||||
|
||||
return new GenotypeBuilder(name, Arrays.asList(alleles)).PL(gls).make();
|
||||
}
|
||||
|
||||
@DataProvider(name = "getGLs")
|
||||
public Object[][] createGLsData() {
|
||||
|
||||
// bi-allelic diploid case
|
||||
new GetGLsTest("B0", 1, 2, createGenotype("AA1", AA1,2), createGenotype("AA2", AA1,2), createGenotype("AA3", AA1,2));
|
||||
new GetGLsTest("B1", 1, 2, createGenotype("AA1", AA1,2), createGenotype("AA2", AA1,2), createGenotype("AB", AB1,2));
|
||||
new GetGLsTest("B2", 1, 2, createGenotype("AA1", AA1,2), createGenotype("BB", BB1,2), createGenotype("AA2", AA1,2));
|
||||
new GetGLsTest("B3a", 1, 2, createGenotype("AB", AB1,2), createGenotype("AA", AA1,2), createGenotype("BB", BB1,2));
|
||||
new GetGLsTest("B3b", 1, 2, createGenotype("AB1", AB1,2), createGenotype("AB2", AB1,2), createGenotype("AB3", AB1,2));
|
||||
new GetGLsTest("B4", 1, 2, createGenotype("BB1", BB1,2), createGenotype("BB2", BB1,2), createGenotype("AA", AA1,2));
|
||||
new GetGLsTest("B5", 1, 2, createGenotype("BB1", BB1,2), createGenotype("AB", AB1,2), createGenotype("BB2", BB1,2));
|
||||
new GetGLsTest("B6", 1, 2, createGenotype("BB1", BB1,2), createGenotype("BB2", BB1,2), createGenotype("BB3", BB1,2));
|
||||
|
||||
// tri-allelic diploid case
|
||||
new GetGLsTest("B1C0", 2, 2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("AB", AB2,2));
|
||||
new GetGLsTest("B0C1", 2, 2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("AC", AC2,2));
|
||||
new GetGLsTest("B1C1a", 2,2, createGenotype("AA", AA2,2), createGenotype("AB", AB2,2), createGenotype("AC", AC2,2));
|
||||
new GetGLsTest("B1C1b", 2,2, createGenotype("AA1", AA2,2), createGenotype("AA2", AA2,2), createGenotype("BC", BC2,2));
|
||||
new GetGLsTest("B2C1", 2, 2, createGenotype("AB1", AB2,2), createGenotype("AB2", AB2,2), createGenotype("AC", AC2,2));
|
||||
new GetGLsTest("B3C2a", 2, 2, createGenotype("AB", AB2,2), createGenotype("BC1", BC2,2), createGenotype("BC2", BC2,2));
|
||||
new GetGLsTest("B3C2b", 2, 2, createGenotype("AB", AB2,2), createGenotype("BB", BB2,2), createGenotype("CC", CC2,2));
|
||||
|
||||
// bi-allelic pool case
|
||||
new GetGLsTest("P0", 1, samplePloidy, createGenotype("A4_1", A4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P1", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("B4_1", B4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P2a", 1,samplePloidy, createGenotype("A4_1", A4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P2b", 1, samplePloidy,createGenotype("B4_1", B4_1,samplePloidy), createGenotype("B4_1", B4_1,samplePloidy), createGenotype("A4_1", A4_1,samplePloidy));
|
||||
new GetGLsTest("P4", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy));
|
||||
new GetGLsTest("P6", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy), createGenotype("C4_1", C4_1,samplePloidy));
|
||||
new GetGLsTest("P8", 1, samplePloidy,createGenotype("A4_1", A4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy), createGenotype("F4_1", F4_1,samplePloidy));
|
||||
|
||||
// multi-allelic pool case
|
||||
new GetGLsTest("B1C3", 2, samplePloidy,createGenotype("A4_400", A4_400,samplePloidy), createGenotype("A4_400", A4_400,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B3C9", 2, samplePloidy,createGenotype("F4_013", F4_013,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B6C0", 2, samplePloidy,createGenotype("B4_310", B4_310,samplePloidy), createGenotype("C4_220", C4_220,samplePloidy), createGenotype("D4_130", D4_130,samplePloidy));
|
||||
new GetGLsTest("B6C4", 2, samplePloidy,createGenotype("D4_130", D4_130,samplePloidy), createGenotype("E4_121", E4_121,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B4C7", 2, samplePloidy,createGenotype("F4_013", F4_013,samplePloidy), createGenotype("E4_121", E4_121,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy));
|
||||
new GetGLsTest("B2C3", 2, samplePloidy,createGenotype("A4_400", A4_400,samplePloidy), createGenotype("F4_013", F4_013,samplePloidy), createGenotype("B4_310", B4_310,samplePloidy));
|
||||
|
||||
return GetGLsTest.getTests(GetGLsTest.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "getGLs")
|
||||
public void testGLs(GetGLsTest cfg) {
|
||||
|
||||
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles);
|
||||
final int len = PoolGenotypeLikelihoods.getNumLikelihoodElements(1+cfg.numAltAlleles,cfg.ploidy*cfg.GLs.size());
|
||||
double[] priors = new double[len]; // flat priors
|
||||
|
||||
PoolAFCalculationModel.combineSinglePools(cfg.GLs, 1+cfg.numAltAlleles, cfg.ploidy, priors, result);
|
||||
int nameIndex = 1;
|
||||
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
|
||||
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
|
||||
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele];
|
||||
|
||||
// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount);
|
||||
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,61 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
|
||||
import java.util.Arrays;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: delangel
|
||||
* Date: 4/5/12
|
||||
* Time: 11:28 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class PoolCallerIntegrationTest extends WalkerTest {
|
||||
final static String REF = b37KGReference;
|
||||
final String CEUTRIO_BAM = "/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37.list";
|
||||
final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
|
||||
final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
|
||||
final String REFSAMPLE_NAME = "NA12878";
|
||||
final String MTINTERVALS = "MT";
|
||||
final String LSVINTERVALS = "20:40,000,000-41,000,000";
|
||||
final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
|
||||
final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
|
||||
final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";
|
||||
private void PC_MT_Test(String bam, String args, String name, String md5) {
|
||||
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm POOLSNP -ignoreLane -pnrm POOL",
|
||||
REF, bam, MTINTERVALS, REFSAMPLE_MT_CALLS, REFSAMPLE_NAME) + " --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
}
|
||||
|
||||
private void PC_LSV_Test(String args, String name, String model, String md5) {
|
||||
final String base = String.format("-T UnifiedGenotyper -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane -pnrm POOL",
|
||||
REF, LSV_BAM, LSVINTERVALS, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testPoolCaller:"+name+" args=" + args, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBOTH_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLBOTH","36b8db57f65be1cc3d2d9d7f9f3f26e4");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testINDEL_GGA_Pools() {
|
||||
PC_LSV_Test(String.format(" -maxAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","POOLINDEL","d1339990291648495bfcf4404f051478");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMT_SNP_DISCOVERY_sp4() {
|
||||
PC_MT_Test(CEUTRIO_BAM, " -maxAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","fa5ee7c957c473a80f3a7f3c35dc80b5");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMT_SNP_GGA_sp10() {
|
||||
|
||||
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "6907c8617d49bb57b33f8704ce7f0323");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,502 @@
|
|||
/*
|
||||
* Copyright (c) 2010.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
package org.broadinstitute.sting.gatk.walkers.genotyper;
|
||||
|
||||
import net.sf.samtools.SAMUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.BaseUtils;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.variantcontext.*;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
public class PoolGenotypeLikelihoodsUnitTest {
|
||||
|
||||
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection();
|
||||
final Logger logger = Logger.getLogger(Walker.class);
|
||||
private static final boolean VERBOSE = false;
|
||||
private static final boolean SIMULATE_NOISY_PILEUP = false;
|
||||
private static final int NUM_SIMULATED_OBS = 10;
|
||||
|
||||
@Test
|
||||
public void testStoringLikelihoodElements() {
|
||||
|
||||
|
||||
// basic test storing a given PL vector in a PoolGenotypeLikelihoods object and then retrieving it back
|
||||
|
||||
int ploidy = 20;
|
||||
int numAlleles = 4;
|
||||
int res = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy);
|
||||
// System.out.format("Alt Alleles: %d, Ploidy: %d, #Likelihoods: %d\n", numAltAlleles, ploidy, res);
|
||||
|
||||
List<Allele> alleles = new ArrayList<Allele>();
|
||||
alleles.add(Allele.create("T",true));
|
||||
alleles.add(Allele.create("C",false));
|
||||
alleles.add(Allele.create("A",false));
|
||||
alleles.add(Allele.create("G",false));
|
||||
|
||||
double[] gls = new double[res];
|
||||
|
||||
for (int k=0; k < gls.length; k++)
|
||||
gls[k]= (double)k;
|
||||
|
||||
PoolGenotypeLikelihoods gl = new PoolSNPGenotypeLikelihoods(alleles, gls,ploidy, null, false,true);
|
||||
double[] glnew = gl.getLikelihoods();
|
||||
|
||||
Assert.assertEquals(gls, glnew);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testElementStorageCache() {
|
||||
// compare cached element storage with compuationally hard-coded iterative computation
|
||||
|
||||
for (int ploidy = 2; ploidy < 10; ploidy++) {
|
||||
for (int nAlleles = 2; nAlleles < 10; nAlleles++)
|
||||
Assert.assertEquals(PoolGenotypeLikelihoods.getNumLikelihoodElements(nAlleles,ploidy),
|
||||
GenotypeLikelihoods.numLikelihoods(nAlleles, ploidy));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVectorToLinearIndex() {
|
||||
|
||||
// create iterator, compare linear index given by iterator with closed form function
|
||||
int numAlleles = 4;
|
||||
int ploidy = 2;
|
||||
PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(numAlleles, ploidy);
|
||||
|
||||
while(iterator.hasNext()) {
|
||||
System.out.format("\n%d:",iterator.getLinearIndex());
|
||||
int[] a = iterator.getCurrentVector();
|
||||
for (int aa: a)
|
||||
System.out.format("%d ",aa);
|
||||
|
||||
|
||||
int computedIdx = PoolGenotypeLikelihoods.getLinearIndex(a, numAlleles, ploidy);
|
||||
System.out.format("Computed idx = %d\n",computedIdx);
|
||||
iterator.next();
|
||||
}
|
||||
|
||||
}
|
||||
@Test
|
||||
public void testSubsetToAlleles() {
|
||||
|
||||
int ploidy = 2;
|
||||
int numAlleles = 4;
|
||||
int res = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy);
|
||||
// System.out.format("Alt Alleles: %d, Ploidy: %d, #Likelihoods: %d\n", numAltAlleles, ploidy, res);
|
||||
|
||||
List<Allele> originalAlleles = new ArrayList<Allele>();
|
||||
originalAlleles.add(Allele.create("T",true));
|
||||
originalAlleles.add(Allele.create("C",false));
|
||||
originalAlleles.add(Allele.create("A",false));
|
||||
originalAlleles.add(Allele.create("G",false));
|
||||
|
||||
double[] oldLikelihoods = new double[res];
|
||||
|
||||
for (int k=0; k < oldLikelihoods.length; k++)
|
||||
oldLikelihoods[k]= (double)k;
|
||||
|
||||
List<Allele> allelesToSubset = new ArrayList<Allele>();
|
||||
allelesToSubset.add(Allele.create("A",false));
|
||||
allelesToSubset.add(Allele.create("C",false));
|
||||
|
||||
double[] newGLs = PoolGenotypeLikelihoods.subsetToAlleles(oldLikelihoods, ploidy,
|
||||
originalAlleles, allelesToSubset);
|
||||
|
||||
|
||||
/*
|
||||
For P=2, N=4, default iteration order:
|
||||
0:2 0 0 0
|
||||
1:1 1 0 0
|
||||
2:0 2 0 0
|
||||
3:1 0 1 0
|
||||
4:0 1 1 0
|
||||
5:0 0 2 0
|
||||
6:1 0 0 1
|
||||
7:0 1 0 1
|
||||
8:0 0 1 1
|
||||
9:0 0 0 2
|
||||
|
||||
For P=2,N=2, iteration order is:
|
||||
0:2 0
|
||||
1:1 1
|
||||
2:0 2
|
||||
|
||||
From first list, if we're extracting alleles 2 and 1, we need all elements that have zero at positions 0 and 3.
|
||||
These are only elements {2,4,5}. Since test is flipping alleles 2 and 1, order is reversed.
|
||||
*/
|
||||
Assert.assertEquals(newGLs,new double[]{5.0,4.0,2.0});
|
||||
}
|
||||
@Test
|
||||
public void testIndexIterator() {
|
||||
int[] seed = new int[]{1,2,3,4};
|
||||
PoolGenotypeLikelihoods.SumIterator iterator = runIterator(seed,-1);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1);
|
||||
|
||||
seed = new int[]{1,0,1,1};
|
||||
iterator = runIterator(seed,-1);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1);
|
||||
|
||||
seed = new int[]{5};
|
||||
iterator = runIterator(seed,-1);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),prod(seed)-1);
|
||||
|
||||
// Diploid, # alleles = 4
|
||||
seed = new int[]{2,2,2,2};
|
||||
iterator = runIterator(seed,2);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),9);
|
||||
|
||||
// Diploid, # alleles = 2
|
||||
seed = new int[]{2,2};
|
||||
iterator = runIterator(seed,2);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),2);
|
||||
|
||||
// Diploid, # alleles = 3
|
||||
seed = new int[]{2,2,2};
|
||||
iterator = runIterator(seed,2);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),5);
|
||||
|
||||
// Triploid, # alleles = 2
|
||||
seed = new int[]{3,3};
|
||||
iterator = runIterator(seed,3);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),3);
|
||||
// Triploid, # alleles = 3
|
||||
seed = new int[]{3,3,3};
|
||||
iterator = runIterator(seed,3);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),9);
|
||||
|
||||
// Triploid, # alleles = 4
|
||||
seed = new int[]{3,3,3,3};
|
||||
iterator = runIterator(seed,3);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),19);
|
||||
|
||||
// 8-ploid, # alleles = 6
|
||||
seed = new int[]{8,8,8,8,8,8};
|
||||
iterator = runIterator(seed,8);
|
||||
// Assert.assertTrue(compareIntArrays(iterator.getCurrentVector(), seed));
|
||||
Assert.assertEquals(iterator.getLinearIndex(),1286);
|
||||
|
||||
|
||||
}
|
||||
|
||||
private PoolGenotypeLikelihoods.SumIterator runIterator(int[] seed, int restrictSumTo) {
|
||||
PoolGenotypeLikelihoods.SumIterator iterator = new PoolGenotypeLikelihoods.SumIterator(seed, restrictSumTo);
|
||||
|
||||
while(iterator.hasNext()) {
|
||||
int[] a = iterator.getCurrentVector();
|
||||
int idx = PoolGenotypeLikelihoods.getLinearIndex(a, a.length, restrictSumTo);
|
||||
if (VERBOSE) {
|
||||
System.out.format("%d:",iterator.getLinearIndex());
|
||||
for (int i=0; i < seed.length; i++)
|
||||
System.out.format("%d ",a[i]);
|
||||
System.out.format(" LI:%d\n", idx);
|
||||
}
|
||||
iterator.next();
|
||||
}
|
||||
|
||||
return iterator;
|
||||
|
||||
}
|
||||
|
||||
private static int prod(int[] x) {
|
||||
int prod = 1;
|
||||
for (int xx : x) {
|
||||
prod *= (1+xx);
|
||||
}
|
||||
return prod;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testErrorModel() {
|
||||
final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
|
||||
final byte minQ = 5;
|
||||
final byte maxQ = 40;
|
||||
final byte refByte = refPileupTestProvider.getRefByte();
|
||||
final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
|
||||
final String refSampleName = refPileupTestProvider.getSampleNames().get(0);
|
||||
final List<Allele> trueAlleles = new ArrayList<Allele>();
|
||||
trueAlleles.add(Allele.create(refByte, true));
|
||||
|
||||
final VariantContext refVC = new VariantContextBuilder("test","chr1",5, 5,
|
||||
trueAlleles).genotypes(GenotypeBuilder.create(refSampleName, trueAlleles)).make();
|
||||
final int[] matchArray = {95, 995, 9995, 10000};
|
||||
final int[] mismatchArray = {1,5,10,20};
|
||||
if (VERBOSE) System.out.println("Running SNP error model test");
|
||||
|
||||
for (int matches: matchArray) {
|
||||
for (int mismatches: mismatchArray) {
|
||||
// get artificial alignment context for ref sample - no noise
|
||||
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(0, new String(new byte[]{altByte}), new int[]{matches, mismatches}, false, 30);
|
||||
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
|
||||
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refVC, 0.0);
|
||||
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
|
||||
|
||||
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
|
||||
final int peakIdx = (int)Math.round(mlEst);
|
||||
if (VERBOSE) System.out.format("Matches:%d Mismatches:%d maxV:%d peakIdx:%d\n",matches, mismatches, MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testIndelErrorModel() {
|
||||
final ArtificialReadPileupTestProvider refPileupTestProvider = new ArtificialReadPileupTestProvider(1,"ref");
|
||||
final byte minQ = 5;
|
||||
final byte maxQ = 40;
|
||||
final byte refByte = refPileupTestProvider.getRefByte();
|
||||
final String altBases = "TCA";
|
||||
final String refSampleName = refPileupTestProvider.getSampleNames().get(0);
|
||||
final List<Allele> trueAlleles = new ArrayList<Allele>();
|
||||
trueAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, true));
|
||||
trueAlleles.add(Allele.create("TC", false));
|
||||
|
||||
final String fw = new String(refPileupTestProvider.getReferenceContext().getForwardBases());
|
||||
final VariantContext refInsertionVC = new VariantContextBuilder("test","chr1",refPileupTestProvider.getReferenceContext().getLocus().getStart(),
|
||||
refPileupTestProvider.getReferenceContext().getLocus().getStart(), trueAlleles).
|
||||
genotypes(GenotypeBuilder.create(refSampleName, trueAlleles)).referenceBaseForIndel(refByte).make();
|
||||
|
||||
|
||||
final int[] matchArray = {95, 995, 9995, 10000};
|
||||
final int[] mismatchArray = {1,5,10,20};
|
||||
|
||||
if (VERBOSE) System.out.println("Running indel error model test");
|
||||
for (int matches: matchArray) {
|
||||
for (int mismatches: mismatchArray) {
|
||||
// get artificial alignment context for ref sample - no noise
|
||||
// CASE 1: Test HET insertion
|
||||
// Ref sample has TC insertion but pileup will have TCA inserted instead to test mismatches
|
||||
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(altBases.length(), altBases, new int[]{matches, mismatches}, false, 30);
|
||||
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
|
||||
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refInsertionVC, 0.0);
|
||||
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
|
||||
|
||||
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
|
||||
final int peakIdx = (int)Math.round(mlEst);
|
||||
if (VERBOSE) System.out.format("Matches:%d Mismatches:%d peakIdx:%d\n",matches, mismatches, peakIdx);
|
||||
Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
|
||||
// CASE 2: Test HET deletion
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// create deletion VC
|
||||
final int delLength = 4;
|
||||
final List<Allele> delAlleles = new ArrayList<Allele>();
|
||||
delAlleles.add(Allele.create(fw.substring(1,delLength+1), true));
|
||||
delAlleles.add(Allele.create(Allele.NULL_ALLELE_STRING, false));
|
||||
|
||||
final VariantContext refDeletionVC = new VariantContextBuilder("test","chr1",refPileupTestProvider.getReferenceContext().getLocus().getStart(),
|
||||
refPileupTestProvider.getReferenceContext().getLocus().getStart()+delLength, delAlleles).
|
||||
genotypes(GenotypeBuilder.create(refSampleName, delAlleles)).referenceBaseForIndel(refByte).make();
|
||||
|
||||
for (int matches: matchArray) {
|
||||
for (int mismatches: mismatchArray) {
|
||||
// get artificial alignment context for ref sample - no noise
|
||||
// CASE 1: Test HET deletion
|
||||
// Ref sample has 4bp deletion but pileup will have 3 bp deletion instead to test mismatches
|
||||
Map<String,AlignmentContext> refContext = refPileupTestProvider.getAlignmentContextFromAlleles(-delLength+1, altBases, new int[]{matches, mismatches}, false, 30);
|
||||
final ReadBackedPileup refPileup = refContext.get(refSampleName).getBasePileup();
|
||||
final ErrorModel emodel = new ErrorModel(minQ,maxQ, (byte)20, refPileup, refDeletionVC, 0.0);
|
||||
final double[] errorVec = emodel.getErrorModelVector().getProbabilityVector();
|
||||
|
||||
final double mlEst = -10.0*Math.log10((double)mismatches/(double)(matches+mismatches));
|
||||
final int peakIdx = (int)Math.round(mlEst);
|
||||
if (VERBOSE) System.out.format("Matches:%d Mismatches:%d peakIdx:%d\n",matches, mismatches, peakIdx);
|
||||
Assert.assertEquals(MathUtils.maxElementIndex(errorVec),peakIdx);
|
||||
|
||||
// CASE 2: Test HET deletion
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAddPileupToPoolGL() {
|
||||
|
||||
// dummy error model - Q=infinity FAPP so that there's no source of uncertainty
|
||||
final double[] emv = new double[SAMUtils.MAX_PHRED_SCORE+1];
|
||||
|
||||
// error rate for noisy tests
|
||||
final int PHRED_SITE_ERROR_RATE = 20;
|
||||
|
||||
Arrays.fill(emv, Double.NEGATIVE_INFINITY);
|
||||
emv[SAMUtils.MAX_PHRED_SCORE] = 0;
|
||||
|
||||
final int numSamples = 1;
|
||||
|
||||
// have a high quality site say Q40 site, and create artificial pileups for one single sample, at coverage N, with given
|
||||
// true pool AC = x.
|
||||
|
||||
final ArtificialReadPileupTestProvider readPileupTestProvider = new ArtificialReadPileupTestProvider(numSamples,"sample", (byte)SAMUtils.MAX_PHRED_SCORE);
|
||||
final ErrorModel noiselessErrorModel = new ErrorModel(emv);
|
||||
|
||||
final double[] emverr = new double[SAMUtils.MAX_PHRED_SCORE+1];
|
||||
Arrays.fill(emverr, Double.NEGATIVE_INFINITY);
|
||||
emverr[PHRED_SITE_ERROR_RATE] = 0;
|
||||
final ErrorModel Q30ErrorModel = new ErrorModel(emverr);
|
||||
|
||||
|
||||
final int eventLength = 0; // test snp only
|
||||
final byte refByte = readPileupTestProvider.getRefByte();
|
||||
final byte altByte = refByte == (byte)'T'? (byte) 'C': (byte)'T';
|
||||
|
||||
final int refIdx = BaseUtils.simpleBaseToBaseIndex(refByte);
|
||||
final int altIdx = BaseUtils.simpleBaseToBaseIndex(altByte);
|
||||
|
||||
final List<Allele> allAlleles = new ArrayList<Allele>(); // this contains only ref Allele up to now
|
||||
final Set<String> laneIDs = new TreeSet<String>();
|
||||
laneIDs.add(GenotypeLikelihoodsCalculationModel.DUMMY_LANE);
|
||||
|
||||
final HashMap<String, ErrorModel> noiselessErrorModels = new HashMap<String, ErrorModel>();
|
||||
|
||||
// build per-lane error model for all lanes present in ref sample
|
||||
for (String laneID : laneIDs)
|
||||
noiselessErrorModels.put(laneID, noiselessErrorModel);
|
||||
|
||||
final HashMap<String, ErrorModel> noisyErrorModels = new HashMap<String, ErrorModel>();
|
||||
|
||||
// build per-lane error model for all lanes present in ref sample
|
||||
for (String laneID : laneIDs)
|
||||
noisyErrorModels.put(laneID, Q30ErrorModel);
|
||||
|
||||
for (byte b: BaseUtils.BASES) {
|
||||
if (refByte == b)
|
||||
allAlleles.add(Allele.create(b,true));
|
||||
else
|
||||
allAlleles.add(Allele.create(b, false));
|
||||
}
|
||||
|
||||
PrintStream out = null;
|
||||
if (SIMULATE_NOISY_PILEUP) {
|
||||
try {
|
||||
out = new PrintStream(new File("/humgen/gsa-scr1/delangel/GATK/Sting_unstable_mac/GLUnitTest.table"));
|
||||
// out = new PrintStream(new File("/Users/delangel/GATK/Sting_unstable/GLUnitTest.table"));
|
||||
}
|
||||
catch (Exception e) {}
|
||||
// write header
|
||||
out.format("Depth\tPoolPloidy\tACTrue\tACEst\tREF\tALTTrue\tALTEst\n");
|
||||
}
|
||||
final int[] depthVector = {1000,10000};
|
||||
//final double[] alleleFrequencyVector = {0.01,0.1,0.5,1.0};
|
||||
final int[] spVector = {10,100};
|
||||
//final int[] spVector = {1};
|
||||
for (int depth : depthVector) {
|
||||
for (int nSamplesPerPool : spVector) {
|
||||
final int ploidy = 2*nSamplesPerPool;
|
||||
for (int ac =2; ac <=ploidy; ac++) {
|
||||
|
||||
// simulate pileup with given AC and depth
|
||||
int altDepth = (int)Math.round( (double)ac/(double)ploidy * (double)depth);
|
||||
final int[] numReadsPerAllele = {depth-altDepth,altDepth};
|
||||
final Map<String,AlignmentContext> alignmentContextMap =
|
||||
readPileupTestProvider.getAlignmentContextFromAlleles(eventLength, new String(new byte[]{altByte}), numReadsPerAllele);
|
||||
|
||||
// get now likelihoods for this
|
||||
|
||||
final PoolSNPGenotypeLikelihoods GL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noiselessErrorModels, false, true);
|
||||
final int nGoodBases = GL.add(alignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE);
|
||||
if (VERBOSE) {
|
||||
System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\nGLs:", depth,ac,altDepth, nSamplesPerPool);
|
||||
System.out.println(GL.toString());
|
||||
}
|
||||
Assert.assertEquals(nGoodBases, depth);
|
||||
Pair<int[],Double> mlPair = GL.getMostLikelyACCount();
|
||||
|
||||
// Most likely element has to be conformation REF = nSamples-AC,ALT = AC
|
||||
if (ac == 0) {
|
||||
Assert.assertEquals(mlPair.first[refIdx],ploidy);
|
||||
} else {
|
||||
Assert.assertEquals(mlPair.first[altIdx],ac);
|
||||
Assert.assertEquals(mlPair.first[refIdx],ploidy-ac);
|
||||
}
|
||||
|
||||
|
||||
// simulate now pileup with base error rate
|
||||
if (SIMULATE_NOISY_PILEUP) {
|
||||
System.out.format("Depth:%d, AC:%d, altDepth:%d, samplesPerPool:%d\n", depth,ac,altDepth, nSamplesPerPool);
|
||||
|
||||
for (int k=0; k < NUM_SIMULATED_OBS; k++) {
|
||||
final Map<String,AlignmentContext> noisyAlignmentContextMap =
|
||||
readPileupTestProvider.getAlignmentContextFromAlleles(eventLength, new String(new byte[]{altByte}), numReadsPerAllele,
|
||||
true, PHRED_SITE_ERROR_RATE);
|
||||
|
||||
// get now likelihoods for this
|
||||
|
||||
final PoolSNPGenotypeLikelihoods noisyGL = new PoolSNPGenotypeLikelihoods(allAlleles, null, nSamplesPerPool*2, noisyErrorModels, false,true);
|
||||
noisyGL.add(noisyAlignmentContextMap.get("sample0000").getBasePileup(), true, false, UAC.MIN_BASE_QUALTY_SCORE);
|
||||
mlPair = noisyGL.getMostLikelyACCount();
|
||||
|
||||
// Most likely element has to be conformation REF = nSamples-AC,ALT = AC
|
||||
int acEst;
|
||||
if (ac == 0) {
|
||||
acEst = mlPair.first[refIdx];
|
||||
} else {
|
||||
acEst = mlPair.first[altIdx];
|
||||
}
|
||||
byte altEst = BaseUtils.baseIndexToSimpleBase(MathUtils.maxElementIndex(mlPair.first));
|
||||
out.format("%d\t%d\t%d\t%d\t%c\t%c\t%c\n",depth, ploidy, ac, acEst, refByte, altByte, altEst);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
if (SIMULATE_NOISY_PILEUP)
|
||||
out.close();
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,412 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 3/15/12
|
||||
*/
|
||||
|
||||
import net.sf.picard.reference.ReferenceSequenceFile;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Unit tests for GenotypingEngine
|
||||
*/
|
||||
public class GenotypingEngineUnitTest extends BaseTest {
|
||||
|
||||
private static ReferenceSequenceFile seq;
|
||||
private GenomeLocParser genomeLocParser;
|
||||
|
||||
@BeforeClass
|
||||
public void init() throws FileNotFoundException {
|
||||
// sequence
|
||||
seq = new CachingIndexedFastaSequenceFile(new File(b37KGReference));
|
||||
genomeLocParser = new GenomeLocParser(seq);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFindHomVarEventAllelesInSample() {
|
||||
final List<Allele> eventAlleles = new ArrayList<Allele>();
|
||||
eventAlleles.add( Allele.create("A", true) );
|
||||
eventAlleles.add( Allele.create("C", false) );
|
||||
final List<Allele> haplotypeAlleles = new ArrayList<Allele>();
|
||||
haplotypeAlleles.add( Allele.create("AATA", true) );
|
||||
haplotypeAlleles.add( Allele.create("AACA", false) );
|
||||
haplotypeAlleles.add( Allele.create("CATA", false) );
|
||||
haplotypeAlleles.add( Allele.create("CACA", false) );
|
||||
final ArrayList<Haplotype> haplotypes = new ArrayList<Haplotype>();
|
||||
haplotypes.add(new Haplotype("AATA".getBytes()));
|
||||
haplotypes.add(new Haplotype("AACA".getBytes()));
|
||||
haplotypes.add(new Haplotype("CATA".getBytes()));
|
||||
haplotypes.add(new Haplotype("CACA".getBytes()));
|
||||
final List<Allele> haplotypeAllelesForSample = new ArrayList<Allele>();
|
||||
haplotypeAllelesForSample.add( Allele.create("CATA", false) );
|
||||
haplotypeAllelesForSample.add( Allele.create("CACA", false) );
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = new ArrayList<ArrayList<Haplotype>>();
|
||||
ArrayList<Haplotype> Aallele = new ArrayList<Haplotype>();
|
||||
Aallele.add(haplotypes.get(0));
|
||||
Aallele.add(haplotypes.get(1));
|
||||
ArrayList<Haplotype> Callele = new ArrayList<Haplotype>();
|
||||
Callele.add(haplotypes.get(2));
|
||||
Callele.add(haplotypes.get(3));
|
||||
alleleMapper.add(Aallele);
|
||||
alleleMapper.add(Callele);
|
||||
final List<Allele> eventAllelesForSample = new ArrayList<Allele>();
|
||||
eventAllelesForSample.add( Allele.create("C", false) );
|
||||
eventAllelesForSample.add( Allele.create("C", false) );
|
||||
|
||||
if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) {
|
||||
logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes));
|
||||
logger.warn("expected alleles = " + eventAllelesForSample);
|
||||
}
|
||||
Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFindHetEventAllelesInSample() {
|
||||
final List<Allele> eventAlleles = new ArrayList<Allele>();
|
||||
eventAlleles.add( Allele.create("A", true) );
|
||||
eventAlleles.add( Allele.create("C", false) );
|
||||
eventAlleles.add( Allele.create("T", false) );
|
||||
final List<Allele> haplotypeAlleles = new ArrayList<Allele>();
|
||||
haplotypeAlleles.add( Allele.create("AATA", true) );
|
||||
haplotypeAlleles.add( Allele.create("AACA", false) );
|
||||
haplotypeAlleles.add( Allele.create("CATA", false) );
|
||||
haplotypeAlleles.add( Allele.create("CACA", false) );
|
||||
haplotypeAlleles.add( Allele.create("TACA", false) );
|
||||
haplotypeAlleles.add( Allele.create("TTCA", false) );
|
||||
haplotypeAlleles.add( Allele.create("TTTA", false) );
|
||||
final ArrayList<Haplotype> haplotypes = new ArrayList<Haplotype>();
|
||||
haplotypes.add(new Haplotype("AATA".getBytes()));
|
||||
haplotypes.add(new Haplotype("AACA".getBytes()));
|
||||
haplotypes.add(new Haplotype("CATA".getBytes()));
|
||||
haplotypes.add(new Haplotype("CACA".getBytes()));
|
||||
haplotypes.add(new Haplotype("TACA".getBytes()));
|
||||
haplotypes.add(new Haplotype("TTCA".getBytes()));
|
||||
haplotypes.add(new Haplotype("TTTA".getBytes()));
|
||||
final List<Allele> haplotypeAllelesForSample = new ArrayList<Allele>();
|
||||
haplotypeAllelesForSample.add( Allele.create("TTTA", false) );
|
||||
haplotypeAllelesForSample.add( Allele.create("AATA", true) );
|
||||
final ArrayList<ArrayList<Haplotype>> alleleMapper = new ArrayList<ArrayList<Haplotype>>();
|
||||
ArrayList<Haplotype> Aallele = new ArrayList<Haplotype>();
|
||||
Aallele.add(haplotypes.get(0));
|
||||
Aallele.add(haplotypes.get(1));
|
||||
ArrayList<Haplotype> Callele = new ArrayList<Haplotype>();
|
||||
Callele.add(haplotypes.get(2));
|
||||
Callele.add(haplotypes.get(3));
|
||||
ArrayList<Haplotype> Tallele = new ArrayList<Haplotype>();
|
||||
Tallele.add(haplotypes.get(4));
|
||||
Tallele.add(haplotypes.get(5));
|
||||
Tallele.add(haplotypes.get(6));
|
||||
alleleMapper.add(Aallele);
|
||||
alleleMapper.add(Callele);
|
||||
alleleMapper.add(Tallele);
|
||||
final List<Allele> eventAllelesForSample = new ArrayList<Allele>();
|
||||
eventAllelesForSample.add( Allele.create("A", true) );
|
||||
eventAllelesForSample.add( Allele.create("T", false) );
|
||||
|
||||
if(!compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes))) {
|
||||
logger.warn("calc alleles = " + GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes));
|
||||
logger.warn("expected alleles = " + eventAllelesForSample);
|
||||
}
|
||||
Assert.assertTrue(compareAlleleLists(eventAllelesForSample, GenotypingEngine.findEventAllelesInSample(eventAlleles, haplotypeAlleles, haplotypeAllelesForSample, alleleMapper, haplotypes)));
|
||||
}
|
||||
|
||||
private boolean compareAlleleLists(List<Allele> l1, List<Allele> l2) {
|
||||
if( l1.size() != l2.size() ) {
|
||||
return false; // sanity check
|
||||
}
|
||||
|
||||
for( int i=0; i < l1.size(); i++ ){
|
||||
if ( !l2.contains(l1.get(i)) )
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
private class BasicGenotypingTestProvider extends TestDataProvider {
|
||||
byte[] ref;
|
||||
byte[] hap;
|
||||
HashMap<Integer,Byte> expected;
|
||||
GenotypingEngine ge = new GenotypingEngine(false, 0, false);
|
||||
|
||||
public BasicGenotypingTestProvider(String refString, String hapString, HashMap<Integer, Byte> expected) {
|
||||
super(BasicGenotypingTestProvider.class, String.format("Haplotype to VCF test: ref = %s, alignment = %s", refString,hapString));
|
||||
ref = refString.getBytes();
|
||||
hap = hapString.getBytes();
|
||||
this.expected = expected;
|
||||
}
|
||||
|
||||
public HashMap<Integer,VariantContext> calcAlignment() {
|
||||
final SWPairwiseAlignment alignment = new SWPairwiseAlignment(ref, hap);
|
||||
return ge.generateVCsFromAlignment( alignment.getAlignmentStart2wrt1(), alignment.getCigar(), ref, hap, genomeLocParser.createGenomeLoc("4",1,1+ref.length), "name", 0);
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "BasicGenotypingTestProvider")
|
||||
public Object[][] makeBasicGenotypingTests() {
|
||||
|
||||
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
|
||||
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
|
||||
map.put(1 + contextSize, (byte)'M');
|
||||
final String context = Utils.dupString('G', contextSize);
|
||||
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACTAGCCGATAG", map);
|
||||
}
|
||||
|
||||
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
|
||||
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
|
||||
map.put(2 + contextSize, (byte)'M');
|
||||
map.put(21 + contextSize, (byte)'M');
|
||||
final String context = Utils.dupString('G', contextSize);
|
||||
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG", "ATCTCGCATCGCGAGCATCGCCTAGCCGATAG", map);
|
||||
}
|
||||
|
||||
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
|
||||
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
|
||||
map.put(1 + contextSize, (byte)'M');
|
||||
map.put(20 + contextSize, (byte)'I');
|
||||
final String context = Utils.dupString('G', contextSize);
|
||||
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGACACTAGCCGATAG", map);
|
||||
}
|
||||
|
||||
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
|
||||
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
|
||||
map.put(1 + contextSize, (byte)'M');
|
||||
map.put(20 + contextSize, (byte)'D');
|
||||
final String context = Utils.dupString('G', contextSize);
|
||||
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map);
|
||||
}
|
||||
|
||||
for( int contextSize : new int[]{1,5,9,24,36} ) {
|
||||
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
|
||||
map.put(1, (byte)'M');
|
||||
map.put(20, (byte)'D');
|
||||
final String context = Utils.dupString('G', contextSize);
|
||||
new BasicGenotypingTestProvider("AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCGATAG", map);
|
||||
}
|
||||
|
||||
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
|
||||
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
|
||||
map.put(2 + contextSize, (byte)'M');
|
||||
map.put(20 + contextSize, (byte)'I');
|
||||
map.put(30 + contextSize, (byte)'D');
|
||||
final String context = Utils.dupString('G', contextSize);
|
||||
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "ACCTCGCATCGCGAGCATCGTTACTAGCCGATG", map);
|
||||
}
|
||||
|
||||
for( int contextSize : new int[]{0,1,5,9,24,36} ) {
|
||||
HashMap<Integer, Byte> map = new HashMap<Integer, Byte>();
|
||||
map.put(1 + contextSize, (byte)'M');
|
||||
map.put(20 + contextSize, (byte)'D');
|
||||
map.put(28 + contextSize, (byte)'M');
|
||||
final String context = Utils.dupString('G', contextSize);
|
||||
new BasicGenotypingTestProvider(context + "AGCTCGCATCGCGAGCATCGACTAGCCGATAG" + context, "CGCTCGCATCGCGAGCATCGCTAGCCCATAG", map);
|
||||
}
|
||||
|
||||
return BasicGenotypingTestProvider.getTests(BasicGenotypingTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BasicGenotypingTestProvider", enabled = true)
|
||||
public void testHaplotypeToVCF(BasicGenotypingTestProvider cfg) {
|
||||
HashMap<Integer,VariantContext> calculatedMap = cfg.calcAlignment();
|
||||
HashMap<Integer,Byte> expectedMap = cfg.expected;
|
||||
logger.warn(String.format("Test: %s", cfg.toString()));
|
||||
if(!compareVCMaps(calculatedMap, expectedMap)) {
|
||||
logger.warn("calc map = " + calculatedMap);
|
||||
logger.warn("expected map = " + expectedMap);
|
||||
}
|
||||
Assert.assertTrue(compareVCMaps(calculatedMap, expectedMap));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests that we get the right values from the R^2 calculation
|
||||
*/
|
||||
@Test
|
||||
public void testCalculateR2LD() {
|
||||
logger.warn("Executing testCalculateR2LD");
|
||||
|
||||
Assert.assertEquals(GenotypingEngine.calculateR2LD(1,1,1,1), 0.0, 0.00001);
|
||||
Assert.assertEquals(GenotypingEngine.calculateR2LD(100,100,100,100), 0.0, 0.00001);
|
||||
Assert.assertEquals(GenotypingEngine.calculateR2LD(1,0,0,1), 1.0, 0.00001);
|
||||
Assert.assertEquals(GenotypingEngine.calculateR2LD(100,0,0,100), 1.0, 0.00001);
|
||||
Assert.assertEquals(GenotypingEngine.calculateR2LD(1,2,3,4), (0.1 - 0.12) * (0.1 - 0.12) / (0.3 * 0.7 * 0.4 * 0.6), 0.00001);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreateMergedVariantContext() {
|
||||
logger.warn("Executing testCreateMergedVariantContext");
|
||||
|
||||
final byte[] ref = "AATTCCGGAATTCCGGAATT".getBytes();
|
||||
final GenomeLoc refLoc = genomeLocParser.createGenomeLoc("2", 1700, 1700 + ref.length);
|
||||
|
||||
// SNP + SNP = simple MNP
|
||||
VariantContext thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
|
||||
VariantContext nextVC = new VariantContextBuilder().loc("2", 1704, 1704).alleles("C","G").make();
|
||||
VariantContext truthVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","GG").source("merged").make();
|
||||
VariantContext mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
logger.warn(truthVC + " == " + mergedVC);
|
||||
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
|
||||
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
|
||||
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
|
||||
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
|
||||
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
|
||||
|
||||
// SNP + ref + SNP = MNP with ref base gap
|
||||
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
|
||||
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
|
||||
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCG").source("merged").make();
|
||||
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
logger.warn(truthVC + " == " + mergedVC);
|
||||
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
|
||||
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
|
||||
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
|
||||
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
|
||||
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
|
||||
|
||||
// insertion + SNP
|
||||
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","AAAAA").referenceBaseForIndel("T").make();
|
||||
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
|
||||
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TAAAAACG").source("merged").make();
|
||||
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
logger.warn(truthVC + " == " + mergedVC);
|
||||
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
|
||||
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
|
||||
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
|
||||
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
|
||||
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
|
||||
|
||||
// SNP + insertion
|
||||
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
|
||||
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("-","AAAAA").referenceBaseForIndel("C").make();
|
||||
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","GCCAAAAA").source("merged").make();
|
||||
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
logger.warn(truthVC + " == " + mergedVC);
|
||||
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
|
||||
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
|
||||
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
|
||||
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
|
||||
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
|
||||
|
||||
// deletion + SNP
|
||||
thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("C","-").referenceBaseForIndel("T").make();
|
||||
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("C","G").make();
|
||||
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TG").source("merged").make();
|
||||
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
logger.warn(truthVC + " == " + mergedVC);
|
||||
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
|
||||
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
|
||||
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
|
||||
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
|
||||
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
|
||||
|
||||
// SNP + deletion
|
||||
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("T","G").make();
|
||||
nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
|
||||
truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","GCC").source("merged").make();
|
||||
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
logger.warn(truthVC + " == " + mergedVC);
|
||||
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
|
||||
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
|
||||
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
|
||||
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
|
||||
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
|
||||
|
||||
// insertion + deletion = MNP
|
||||
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","A").referenceBaseForIndel("T").make();
|
||||
nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
|
||||
truthVC = new VariantContextBuilder().loc("2", 1704, 1706).alleles("CCG","ACC").source("merged").make();
|
||||
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
logger.warn(truthVC + " == " + mergedVC);
|
||||
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
|
||||
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
|
||||
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
|
||||
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
|
||||
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
|
||||
|
||||
// insertion + deletion
|
||||
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","AAAAA").referenceBaseForIndel("T").make();
|
||||
nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
|
||||
truthVC = new VariantContextBuilder().loc("2", 1703, 1706).alleles("TCCG","TAAAAACC").source("merged").make();
|
||||
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
logger.warn(truthVC + " == " + mergedVC);
|
||||
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
|
||||
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
|
||||
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
|
||||
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
|
||||
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
|
||||
|
||||
// insertion + insertion
|
||||
thisVC = new VariantContextBuilder().loc("2", 1703, 1703).alleles("-","A").referenceBaseForIndel("T").make();
|
||||
nextVC = new VariantContextBuilder().loc("2", 1705, 1705).alleles("-","A").referenceBaseForIndel("C").make();
|
||||
truthVC = new VariantContextBuilder().loc("2", 1703, 1705).alleles("TCC","TACCA").source("merged").make();
|
||||
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
logger.warn(truthVC + " == " + mergedVC);
|
||||
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
|
||||
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
|
||||
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
|
||||
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
|
||||
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
|
||||
|
||||
// deletion + deletion
|
||||
thisVC = new VariantContextBuilder().loc("2", 1701, 1702).alleles("T","-").referenceBaseForIndel("A").make();
|
||||
nextVC = new VariantContextBuilder().loc("2", 1705, 1706).alleles("G","-").referenceBaseForIndel("C").make();
|
||||
truthVC = new VariantContextBuilder().loc("2", 1701, 1706).alleles("ATTCCG","ATCC").source("merged").make();
|
||||
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
logger.warn(truthVC + " == " + mergedVC);
|
||||
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
|
||||
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
|
||||
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
|
||||
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
|
||||
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
|
||||
|
||||
// complex + complex
|
||||
thisVC = new VariantContextBuilder().loc("2", 1703, 1704).alleles("TC","AAA").make();
|
||||
nextVC = new VariantContextBuilder().loc("2", 1706, 1707).alleles("GG","AC").make();
|
||||
truthVC = new VariantContextBuilder().loc("2", 1703, 1707).alleles("TCCGG","AAACAC").source("merged").make();
|
||||
mergedVC = GenotypingEngine.createMergedVariantContext(thisVC, nextVC, ref, refLoc);
|
||||
logger.warn(truthVC + " == " + mergedVC);
|
||||
Assert.assertTrue(truthVC.hasSameAllelesAs(mergedVC));
|
||||
Assert.assertEquals(truthVC.getStart(), mergedVC.getStart());
|
||||
Assert.assertEquals(truthVC.getEnd(), mergedVC.getEnd());
|
||||
Assert.assertEquals(truthVC.hasReferenceBaseForIndel(), mergedVC.hasReferenceBaseForIndel());
|
||||
Assert.assertEquals(truthVC.getReferenceBaseForIndel(), mergedVC.getReferenceBaseForIndel());
|
||||
}
|
||||
|
||||
/**
|
||||
* Private function to compare HashMap of VCs, it only checks the types and start locations of the VariantContext
|
||||
*/
|
||||
private boolean compareVCMaps(HashMap<Integer, VariantContext> calc, HashMap<Integer, Byte> expected) {
|
||||
if( !calc.keySet().equals(expected.keySet()) ) { return false; } // sanity check
|
||||
for( Integer loc : expected.keySet() ) {
|
||||
Byte type = expected.get(loc);
|
||||
switch( type ) {
|
||||
case 'I':
|
||||
if( !calc.get(loc).isSimpleInsertion() ) { return false; }
|
||||
break;
|
||||
case 'D':
|
||||
if( !calc.get(loc).isSimpleDeletion() ) { return false; }
|
||||
break;
|
||||
case 'M':
|
||||
if( !(calc.get(loc).isMNP() || calc.get(loc).isSNP()) ) { return false; }
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,48 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
import org.broadinstitute.sting.WalkerTest;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
public class HaplotypeCallerIntegrationTest extends WalkerTest {
|
||||
final static String REF = b37KGReference;
|
||||
final String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
|
||||
final String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
|
||||
final String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
|
||||
//final String RECAL_FILE = validationDataLocation + "NA12878.kmer.8.subset.recal_data.bqsr";
|
||||
|
||||
private void HCTest(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCaller: args=" + args, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSample() {
|
||||
HCTest(CEUTRIO_BAM, "", "7b4e76934e0c911220b4e7da8776ab2b");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerSingleSample() {
|
||||
HCTest(NA12878_BAM, "", "fcf0cea98a571d5e2d1dfa8b5edc599d");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleGGA() {
|
||||
HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "ff370c42c8b09a29f1aeff5ac57c7ea6");
|
||||
}
|
||||
|
||||
private void HCTestComplexVariants(String bam, String args, String md5) {
|
||||
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 3";
|
||||
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
|
||||
executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHaplotypeCallerMultiSampleComplex() {
|
||||
HCTestComplexVariants(CEUTRIO_BAM, "", "6f9fda3ea82c5696bed1d48ee90cd76b");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,173 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 3/14/12
|
||||
*/
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Unit tests for LikelihoodCalculationEngine
|
||||
*/
|
||||
public class LikelihoodCalculationEngineUnitTest extends BaseTest {
|
||||
|
||||
@Test
|
||||
public void testNormalizeDiploidLikelihoodMatrixFromLog10() {
|
||||
double[][] likelihoodMatrix = {
|
||||
{-90.2, 0, 0},
|
||||
{-190.1, -2.1, 0},
|
||||
{-7.0, -17.5, -35.9}
|
||||
};
|
||||
double[][] normalizedMatrix = {
|
||||
{-88.1, 0, 0},
|
||||
{-188.0, 0.0, 0},
|
||||
{-4.9, -15.4, -33.8}
|
||||
};
|
||||
|
||||
|
||||
Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix), normalizedMatrix));
|
||||
|
||||
double[][] likelihoodMatrix2 = {
|
||||
{-90.2, 0, 0, 0},
|
||||
{-190.1, -2.1, 0, 0},
|
||||
{-7.0, -17.5, -35.9, 0},
|
||||
{-7.0, -17.5, -35.9, -1000.0},
|
||||
};
|
||||
double[][] normalizedMatrix2 = {
|
||||
{-88.1, 0, 0, 0},
|
||||
{-188.0, 0.0, 0, 0},
|
||||
{-4.9, -15.4, -33.8, 0},
|
||||
{-4.9, -15.4, -33.8, -997.9},
|
||||
};
|
||||
Assert.assertTrue(compareDoubleArrays(LikelihoodCalculationEngine.normalizeDiploidLikelihoodMatrixFromLog10(likelihoodMatrix2), normalizedMatrix2));
|
||||
}
|
||||
|
||||
private class BasicLikelihoodTestProvider extends TestDataProvider {
|
||||
public Double readLikelihoodForHaplotype1;
|
||||
public Double readLikelihoodForHaplotype2;
|
||||
public Double readLikelihoodForHaplotype3;
|
||||
|
||||
public BasicLikelihoodTestProvider(double a, double b) {
|
||||
super(BasicLikelihoodTestProvider.class, String.format("Diploid haplotype likelihoods for reads %f / %f",a,b));
|
||||
readLikelihoodForHaplotype1 = a;
|
||||
readLikelihoodForHaplotype2 = b;
|
||||
readLikelihoodForHaplotype3 = null;
|
||||
}
|
||||
|
||||
public BasicLikelihoodTestProvider(double a, double b, double c) {
|
||||
super(BasicLikelihoodTestProvider.class, String.format("Diploid haplotype likelihoods for reads %f / %f / %f",a,b,c));
|
||||
readLikelihoodForHaplotype1 = a;
|
||||
readLikelihoodForHaplotype2 = b;
|
||||
readLikelihoodForHaplotype3 = c;
|
||||
}
|
||||
|
||||
public double[][] expectedDiploidHaplotypeMatrix() {
|
||||
if( readLikelihoodForHaplotype3 == null ) {
|
||||
double maxValue = Math.max(readLikelihoodForHaplotype1,readLikelihoodForHaplotype2);
|
||||
double[][] normalizedMatrix = {
|
||||
{readLikelihoodForHaplotype1 - maxValue, Double.NEGATIVE_INFINITY},
|
||||
{Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype2)) - maxValue, readLikelihoodForHaplotype2 - maxValue}
|
||||
};
|
||||
return normalizedMatrix;
|
||||
} else {
|
||||
double maxValue = MathUtils.max(readLikelihoodForHaplotype1,readLikelihoodForHaplotype2,readLikelihoodForHaplotype3);
|
||||
double[][] normalizedMatrix = {
|
||||
{readLikelihoodForHaplotype1 - maxValue, Double.NEGATIVE_INFINITY, Double.NEGATIVE_INFINITY},
|
||||
{Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype2)) - maxValue, readLikelihoodForHaplotype2 - maxValue, Double.NEGATIVE_INFINITY},
|
||||
{Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype1) + 0.5*Math.pow(10,readLikelihoodForHaplotype3)) - maxValue,
|
||||
Math.log10(0.5*Math.pow(10,readLikelihoodForHaplotype2) + 0.5*Math.pow(10,readLikelihoodForHaplotype3)) - maxValue, readLikelihoodForHaplotype3 - maxValue}
|
||||
};
|
||||
return normalizedMatrix;
|
||||
}
|
||||
}
|
||||
|
||||
public double[][] calcDiploidHaplotypeMatrix() {
|
||||
ArrayList<Haplotype> haplotypes = new ArrayList<Haplotype>();
|
||||
for( int iii = 1; iii <= 3; iii++) {
|
||||
Double readLikelihood = ( iii == 1 ? readLikelihoodForHaplotype1 : ( iii == 2 ? readLikelihoodForHaplotype2 : readLikelihoodForHaplotype3) );
|
||||
if( readLikelihood != null ) {
|
||||
Haplotype haplotype = new Haplotype( (iii == 1 ? "AAAA" : (iii == 2 ? "CCCC" : "TTTT")).getBytes() );
|
||||
haplotype.addReadLikelihoods("myTestSample", new double[]{readLikelihood});
|
||||
haplotypes.add(haplotype);
|
||||
}
|
||||
}
|
||||
return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, "myTestSample");
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "BasicLikelihoodTestProvider")
|
||||
public Object[][] makeBasicLikelihoodTests() {
|
||||
new BasicLikelihoodTestProvider(-1.1, -2.2);
|
||||
new BasicLikelihoodTestProvider(-2.2, -1.1);
|
||||
new BasicLikelihoodTestProvider(-1.1, -1.1);
|
||||
new BasicLikelihoodTestProvider(-9.7, -15.0);
|
||||
new BasicLikelihoodTestProvider(-1.1, -2000.2);
|
||||
new BasicLikelihoodTestProvider(-1000.1, -2.2);
|
||||
new BasicLikelihoodTestProvider(0, 0);
|
||||
new BasicLikelihoodTestProvider(-1.1, 0);
|
||||
new BasicLikelihoodTestProvider(0, -2.2);
|
||||
new BasicLikelihoodTestProvider(-100.1, -200.2);
|
||||
|
||||
new BasicLikelihoodTestProvider(-1.1, -2.2, 0);
|
||||
new BasicLikelihoodTestProvider(-2.2, -1.1, 0);
|
||||
new BasicLikelihoodTestProvider(-1.1, -1.1, 0);
|
||||
new BasicLikelihoodTestProvider(-9.7, -15.0, 0);
|
||||
new BasicLikelihoodTestProvider(-1.1, -2000.2, 0);
|
||||
new BasicLikelihoodTestProvider(-1000.1, -2.2, 0);
|
||||
new BasicLikelihoodTestProvider(0, 0, 0);
|
||||
new BasicLikelihoodTestProvider(-1.1, 0, 0);
|
||||
new BasicLikelihoodTestProvider(0, -2.2, 0);
|
||||
new BasicLikelihoodTestProvider(-100.1, -200.2, 0);
|
||||
|
||||
new BasicLikelihoodTestProvider(-1.1, -2.2, -12.121);
|
||||
new BasicLikelihoodTestProvider(-2.2, -1.1, -12.121);
|
||||
new BasicLikelihoodTestProvider(-1.1, -1.1, -12.121);
|
||||
new BasicLikelihoodTestProvider(-9.7, -15.0, -12.121);
|
||||
new BasicLikelihoodTestProvider(-1.1, -2000.2, -12.121);
|
||||
new BasicLikelihoodTestProvider(-1000.1, -2.2, -12.121);
|
||||
new BasicLikelihoodTestProvider(0, 0, -12.121);
|
||||
new BasicLikelihoodTestProvider(-1.1, 0, -12.121);
|
||||
new BasicLikelihoodTestProvider(0, -2.2, -12.121);
|
||||
new BasicLikelihoodTestProvider(-100.1, -200.2, -12.121);
|
||||
|
||||
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
|
||||
public void testOneReadWithTwoOrThreeHaplotypes(BasicLikelihoodTestProvider cfg) {
|
||||
double[][] calculatedMatrix = cfg.calcDiploidHaplotypeMatrix();
|
||||
double[][] expectedMatrix = cfg.expectedDiploidHaplotypeMatrix();
|
||||
logger.warn(String.format("Test: %s", cfg.toString()));
|
||||
Assert.assertTrue(compareDoubleArrays(calculatedMatrix, expectedMatrix));
|
||||
}
|
||||
|
||||
/**
|
||||
* Private function to compare 2d arrays
|
||||
*/
|
||||
private boolean compareDoubleArrays(double[][] b1, double[][] b2) {
|
||||
if( b1.length != b2.length ) {
|
||||
return false; // sanity check
|
||||
}
|
||||
|
||||
for( int i=0; i < b1.length; i++ ){
|
||||
if( b1[i].length != b2[i].length) {
|
||||
return false; // sanity check
|
||||
}
|
||||
for( int j=0; j < b1.length; j++ ){
|
||||
if ( MathUtils.compareDoubles(b1[i][j], b2[i][j]) != 0 && !Double.isInfinite(b1[i][j]) && !Double.isInfinite(b2[i][j]))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,257 @@
|
|||
package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: 3/27/12
|
||||
*/
|
||||
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.Haplotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.Allele;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
import org.testng.Assert;
|
||||
import org.testng.annotations.BeforeClass;
|
||||
import org.testng.annotations.DataProvider;
|
||||
import org.testng.annotations.Test;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.*;
|
||||
|
||||
public class SimpleDeBruijnAssemblerUnitTest extends BaseTest {
|
||||
|
||||
|
||||
private class MergeNodesWithNoVariationTestProvider extends TestDataProvider {
|
||||
public byte[] sequence;
|
||||
public int KMER_LENGTH;
|
||||
|
||||
public MergeNodesWithNoVariationTestProvider(String seq, int kmer) {
|
||||
super(MergeNodesWithNoVariationTestProvider.class, String.format("Merge nodes with no variation test. kmer = %d, seq = %s", kmer, seq));
|
||||
sequence = seq.getBytes();
|
||||
KMER_LENGTH = kmer;
|
||||
}
|
||||
|
||||
public DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> expectedGraph() {
|
||||
DeBruijnVertex v = new DeBruijnVertex(sequence, 0);
|
||||
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
graph.addVertex(v);
|
||||
return graph;
|
||||
}
|
||||
|
||||
public DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> calcGraph() {
|
||||
|
||||
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
|
||||
for (int i = 0; i < kmersInSequence - 1; i++) {
|
||||
// get the kmers
|
||||
final byte[] kmer1 = new byte[KMER_LENGTH];
|
||||
System.arraycopy(sequence, i, kmer1, 0, KMER_LENGTH);
|
||||
final byte[] kmer2 = new byte[KMER_LENGTH];
|
||||
System.arraycopy(sequence, i+1, kmer2, 0, KMER_LENGTH);
|
||||
|
||||
SimpleDeBruijnAssembler.addKmersToGraph(graph, kmer1, kmer2, false);
|
||||
}
|
||||
SimpleDeBruijnAssembler.mergeNodes(graph);
|
||||
return graph;
|
||||
}
|
||||
}
|
||||
|
||||
@DataProvider(name = "MergeNodesWithNoVariationTestProvider")
|
||||
public Object[][] makeMergeNodesWithNoVariationTests() {
|
||||
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 3);
|
||||
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 4);
|
||||
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 5);
|
||||
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 6);
|
||||
new MergeNodesWithNoVariationTestProvider("GGTTAACC", 7);
|
||||
new MergeNodesWithNoVariationTestProvider("GGTTAACCATGCAGACGGGAGGCTGAGCGAGAGTTTT", 6);
|
||||
new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 66);
|
||||
new MergeNodesWithNoVariationTestProvider("AATACCATTGGAGTTTTTTTCCAGGTTAAGATGGTGCATTGAATCCACCCATCTACTTTTGCTCCTCCCAAAACTCACTAAAACTATTATAAAGGGATTTTGTTTAAAGACACAAACTCATGAGGACAGAGAGAACAGAGTAGACAATAGTGGGGGAAAAATAAGTTGGAAGATAGAAAACAGATGGGTGAGTGGTAATCGACTCAGCAGCCCCAAGAAAGCTGAAACCCAGGGAAAGTTAAGAGTAGCCCTATTTTCATGGCAAAATCCAAGGGGGGGTGGGGAAAGAAAGAAAAACAGAAAAAAAAATGGGAATTGGCAGTCCTAGATATCTCTGGTACTGGGCAAGCCAAAGAATCAGGATAACTGGGTGAAAGGTGATTGGGAAGCAGTTAAAATCTTAGTTCCCCTCTTCCACTCTCCGAGCAGCAGGTTTCTCTCTCTCATCAGGCAGAGGGCTGGAGAT", 76);
|
||||
|
||||
return MergeNodesWithNoVariationTestProvider.getTests(MergeNodesWithNoVariationTestProvider.class);
|
||||
}
|
||||
|
||||
@Test(dataProvider = "MergeNodesWithNoVariationTestProvider", enabled = true)
|
||||
public void testMergeNodesWithNoVariation(MergeNodesWithNoVariationTestProvider cfg) {
|
||||
logger.warn(String.format("Test: %s", cfg.toString()));
|
||||
Assert.assertTrue(graphEquals(cfg.calcGraph(), cfg.expectedGraph()));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testPruneGraph() {
|
||||
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> expectedGraph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
|
||||
DeBruijnVertex v = new DeBruijnVertex("ATGG".getBytes(), 0);
|
||||
DeBruijnVertex v2 = new DeBruijnVertex("ATGGA".getBytes(), 0);
|
||||
DeBruijnVertex v3 = new DeBruijnVertex("ATGGT".getBytes(), 0);
|
||||
DeBruijnVertex v4 = new DeBruijnVertex("ATGGG".getBytes(), 0);
|
||||
DeBruijnVertex v5 = new DeBruijnVertex("ATGGC".getBytes(), 0);
|
||||
DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC".getBytes(), 0);
|
||||
|
||||
graph.addVertex(v);
|
||||
graph.addVertex(v2);
|
||||
graph.addVertex(v3);
|
||||
graph.addVertex(v4);
|
||||
graph.addVertex(v5);
|
||||
graph.addVertex(v6);
|
||||
graph.addEdge(v, v2, new DeBruijnEdge(false, 1));
|
||||
graph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
|
||||
graph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
|
||||
graph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
|
||||
graph.addEdge(v5, v6, new DeBruijnEdge(false, 2));
|
||||
|
||||
expectedGraph.addVertex(v2);
|
||||
expectedGraph.addVertex(v3);
|
||||
expectedGraph.addVertex(v4);
|
||||
expectedGraph.addVertex(v5);
|
||||
expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
|
||||
expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
|
||||
expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
|
||||
|
||||
SimpleDeBruijnAssembler.pruneGraph(graph, 2);
|
||||
|
||||
Assert.assertTrue(graphEquals(graph, expectedGraph));
|
||||
|
||||
graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
expectedGraph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
|
||||
graph.addVertex(v);
|
||||
graph.addVertex(v2);
|
||||
graph.addVertex(v3);
|
||||
graph.addVertex(v4);
|
||||
graph.addVertex(v5);
|
||||
graph.addVertex(v6);
|
||||
graph.addEdge(v, v2, new DeBruijnEdge(true, 1));
|
||||
graph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
|
||||
graph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
|
||||
graph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
|
||||
|
||||
expectedGraph.addVertex(v);
|
||||
expectedGraph.addVertex(v2);
|
||||
expectedGraph.addVertex(v3);
|
||||
expectedGraph.addVertex(v4);
|
||||
expectedGraph.addVertex(v5);
|
||||
expectedGraph.addEdge(v, v2, new DeBruijnEdge(true, 1));
|
||||
expectedGraph.addEdge(v2, v3, new DeBruijnEdge(false, 3));
|
||||
expectedGraph.addEdge(v3, v4, new DeBruijnEdge(false, 5));
|
||||
expectedGraph.addEdge(v4, v5, new DeBruijnEdge(false, 3));
|
||||
|
||||
SimpleDeBruijnAssembler.pruneGraph(graph, 2);
|
||||
|
||||
Assert.assertTrue(graphEquals(graph, expectedGraph));
|
||||
}
|
||||
|
||||
@Test(enabled = true)
|
||||
public void testEliminateNonRefPaths() {
|
||||
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> expectedGraph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
|
||||
DeBruijnVertex v = new DeBruijnVertex("ATGG".getBytes(), 0);
|
||||
DeBruijnVertex v2 = new DeBruijnVertex("ATGGA".getBytes(), 0);
|
||||
DeBruijnVertex v3 = new DeBruijnVertex("ATGGT".getBytes(), 0);
|
||||
DeBruijnVertex v4 = new DeBruijnVertex("ATGGG".getBytes(), 0);
|
||||
DeBruijnVertex v5 = new DeBruijnVertex("ATGGC".getBytes(), 0);
|
||||
DeBruijnVertex v6 = new DeBruijnVertex("ATGGCCCCCC".getBytes(), 0);
|
||||
|
||||
graph.addVertex(v);
|
||||
graph.addVertex(v2);
|
||||
graph.addVertex(v3);
|
||||
graph.addVertex(v4);
|
||||
graph.addVertex(v5);
|
||||
graph.addVertex(v6);
|
||||
graph.addEdge(v, v2, new DeBruijnEdge(false));
|
||||
graph.addEdge(v2, v3, new DeBruijnEdge(true));
|
||||
graph.addEdge(v3, v4, new DeBruijnEdge(true));
|
||||
graph.addEdge(v4, v5, new DeBruijnEdge(true));
|
||||
graph.addEdge(v5, v6, new DeBruijnEdge(false));
|
||||
|
||||
expectedGraph.addVertex(v2);
|
||||
expectedGraph.addVertex(v3);
|
||||
expectedGraph.addVertex(v4);
|
||||
expectedGraph.addVertex(v5);
|
||||
expectedGraph.addEdge(v2, v3, new DeBruijnEdge());
|
||||
expectedGraph.addEdge(v3, v4, new DeBruijnEdge());
|
||||
expectedGraph.addEdge(v4, v5, new DeBruijnEdge());
|
||||
|
||||
SimpleDeBruijnAssembler.eliminateNonRefPaths(graph);
|
||||
|
||||
Assert.assertTrue(graphEquals(graph, expectedGraph));
|
||||
|
||||
|
||||
|
||||
|
||||
graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
expectedGraph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
|
||||
graph.addVertex(v);
|
||||
graph.addVertex(v2);
|
||||
graph.addVertex(v3);
|
||||
graph.addVertex(v4);
|
||||
graph.addVertex(v5);
|
||||
graph.addVertex(v6);
|
||||
graph.addEdge(v, v2, new DeBruijnEdge(true));
|
||||
graph.addEdge(v2, v3, new DeBruijnEdge(true));
|
||||
graph.addEdge(v4, v5, new DeBruijnEdge(false));
|
||||
graph.addEdge(v5, v6, new DeBruijnEdge(false));
|
||||
|
||||
expectedGraph.addVertex(v);
|
||||
expectedGraph.addVertex(v2);
|
||||
expectedGraph.addVertex(v3);
|
||||
expectedGraph.addEdge(v, v2, new DeBruijnEdge());
|
||||
expectedGraph.addEdge(v2, v3, new DeBruijnEdge());
|
||||
|
||||
SimpleDeBruijnAssembler.eliminateNonRefPaths(graph);
|
||||
|
||||
Assert.assertTrue(graphEquals(graph, expectedGraph));
|
||||
|
||||
|
||||
|
||||
graph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
expectedGraph = new DefaultDirectedGraph<DeBruijnVertex, DeBruijnEdge>(DeBruijnEdge.class);
|
||||
|
||||
graph.addVertex(v);
|
||||
graph.addVertex(v2);
|
||||
graph.addVertex(v3);
|
||||
graph.addVertex(v4);
|
||||
graph.addVertex(v5);
|
||||
graph.addVertex(v6);
|
||||
graph.addEdge(v, v2, new DeBruijnEdge(true));
|
||||
graph.addEdge(v2, v3, new DeBruijnEdge(true));
|
||||
graph.addEdge(v4, v5, new DeBruijnEdge(false));
|
||||
graph.addEdge(v5, v6, new DeBruijnEdge(false));
|
||||
graph.addEdge(v4, v2, new DeBruijnEdge(false));
|
||||
|
||||
expectedGraph.addVertex(v);
|
||||
expectedGraph.addVertex(v2);
|
||||
expectedGraph.addVertex(v3);
|
||||
expectedGraph.addEdge(v, v2, new DeBruijnEdge());
|
||||
expectedGraph.addEdge(v2, v3, new DeBruijnEdge());
|
||||
|
||||
SimpleDeBruijnAssembler.eliminateNonRefPaths(graph);
|
||||
|
||||
Assert.assertTrue(graphEquals(graph, expectedGraph));
|
||||
}
|
||||
|
||||
private boolean graphEquals(DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> g1, DefaultDirectedGraph<DeBruijnVertex,DeBruijnEdge> g2) {
|
||||
if( !(g1.vertexSet().containsAll(g2.vertexSet()) && g2.vertexSet().containsAll(g1.vertexSet())) ) {
|
||||
return false;
|
||||
}
|
||||
for( DeBruijnEdge e1 : g1.edgeSet() ) {
|
||||
boolean found = false;
|
||||
for( DeBruijnEdge e2 : g2.edgeSet() ) {
|
||||
if( e1.equals(g1, e2, g2) ) { found = true; break; }
|
||||
}
|
||||
if( !found ) { return false; }
|
||||
}
|
||||
for( DeBruijnEdge e2 : g2.edgeSet() ) {
|
||||
boolean found = false;
|
||||
for( DeBruijnEdge e1 : g1.edgeSet() ) {
|
||||
if( e2.equals(g2, e1, g1) ) { found = true; break; }
|
||||
}
|
||||
if( !found ) { return false; }
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,121 +0,0 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
library(tools)
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
verbose = TRUE
|
||||
|
||||
input = args[1]
|
||||
covariateName = args[2]
|
||||
|
||||
outfile = paste(input, ".qual_diff_v_", covariateName, ".pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
c <- read.table(input, header=T)
|
||||
c <- c[sort.list(c[,1]),]
|
||||
|
||||
#
|
||||
# Plot residual error as a function of the covariate
|
||||
#
|
||||
|
||||
d.good <- c[c$nBases >= 1000,]
|
||||
d.1000 <- c[c$nBases < 1000,]
|
||||
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
|
||||
rmseAll = sqrt( sum(as.numeric((c$Qempirical-c$Qreported)^2 * c$nBases)) / sum(as.numeric(c$nBases)) )
|
||||
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
|
||||
if( length(d.good$nBases) == length(c$nBases) ) {
|
||||
theTitle = paste("RMSE =", round(rmseAll,digits=3))
|
||||
}
|
||||
# Don't let residual error go off the edge of the plot
|
||||
d.good$residualError = d.good$Qempirical-d.good$Qreported
|
||||
d.good$residualError[which(d.good$residualError > 10)] = 10
|
||||
d.good$residualError[which(d.good$residualError < -10)] = -10
|
||||
d.1000$residualError = d.1000$Qempirical-d.1000$Qreported
|
||||
d.1000$residualError[which(d.1000$residualError > 10)] = 10
|
||||
d.1000$residualError[which(d.1000$residualError < -10)] = -10
|
||||
c$residualError = c$Qempirical-c$Qreported
|
||||
c$residualError[which(c$residualError > 10)] = 10
|
||||
c$residualError[which(c$residualError < -10)] = -10
|
||||
pointType = "p"
|
||||
if( length(c$Covariate) <= 20 ) {
|
||||
pointType = "o"
|
||||
}
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
plot(d.good$Covariate, d.good$residualError, type=pointType, main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(-10, 10), xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(d.1000$Covariate, d.1000$residualError, type=pointType, col="cornflowerblue", pch=20)
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
plot(c$Covariate, c$residualError, type="l", main=theTitle, ylab="Empirical - Reported Quality", xlab=covariateName, col="blue", ylim=c(-10, 10))
|
||||
points(d.1000$Covariate, d.1000$residualError, type="l", col="cornflowerblue")
|
||||
}
|
||||
dev.off()
|
||||
|
||||
if (exists('compactPDF')) {
|
||||
compactPDF(outfile)
|
||||
}
|
||||
|
||||
#
|
||||
# Plot mean quality versus the covariate
|
||||
#
|
||||
|
||||
outfile = paste(input, ".reported_qual_v_", covariateName, ".pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
par(cex=1.1)
|
||||
pointType = "p"
|
||||
if( length(c$Covariate) <= 20 ) {
|
||||
pointType = "o"
|
||||
}
|
||||
theTitle = paste("Quality By", covariateName);
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
plot(d.good$Covariate, d.good$Qreported, type=pointType, main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", pch=20, ylim=c(0, 40), xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(d.1000$Covariate, d.1000$Qreported, type=pointType, col="cornflowerblue", pch=20)
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
plot(c$Covariate, c$Qreported, type="l", main=theTitle, ylab="Mean Reported Quality", xlab=covariateName, col="blue", ylim=c(0, 40))
|
||||
points(d.1000$Covariate, d.1000$Qreported, type="l", col="cornflowerblue")
|
||||
}
|
||||
dev.off()
|
||||
|
||||
if (exists('compactPDF')) {
|
||||
compactPDF(outfile)
|
||||
}
|
||||
|
||||
#
|
||||
# Plot histogram of the covariate
|
||||
#
|
||||
|
||||
e = d.good
|
||||
f = d.1000
|
||||
outfile = paste(input, ".", covariateName,"_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Covariate, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Covariate, f$nBases), f.nBases != 0)
|
||||
|
||||
lwdSize=2
|
||||
if( length(c$Covariate) <= 20 ) {
|
||||
lwdSize=7
|
||||
} else if( length(c$Covariate) <= 70 ) {
|
||||
lwdSize=4
|
||||
}
|
||||
|
||||
if( is.numeric(c$Covariate) ) {
|
||||
if( length(hst$e.Covariate) == 0 ) {
|
||||
plot(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue", main=paste(covariateName,"histogram"), ylim=c(0, max(hst2$f.nBases)), xlab=covariateName, ylab="Count",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
} else {
|
||||
plot(hst$e.Covariate, hst$e.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), xlab=covariateName, ylim=c(0, max(hst$e.nBases)),ylab="Number of Bases",yaxt="n",xlim=c(min(c$Covariate),max(c$Covariate)))
|
||||
points(hst2$f.Covariate, hst2$f.nBases, type="h", lwd=lwdSize, col="cornflowerblue")
|
||||
}
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
} else { # Dinuc (and other non-numeric covariates) are different to make their plots look nice
|
||||
hst=subset(data.frame(c$Covariate, c$nBases), c.nBases != 0)
|
||||
plot(1:length(hst$c.Covariate), hst$c.nBases, type="h", lwd=lwdSize, main=paste(covariateName,"histogram"), ylim=c(0, max(hst$c.nBases)),xlab=covariateName, ylab="Number of Bases",yaxt="n",xaxt="n")
|
||||
if( length(hst$c.Covariate) > 9 ) {
|
||||
axis(1, at=seq(1,length(hst$c.Covariate),2), labels = hst$c.Covariate[seq(1,length(hst$c.Covariate),2)])
|
||||
} else {
|
||||
axis(1, at=seq(1,length(hst$c.Covariate),1), labels = hst$c.Covariate)
|
||||
}
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
}
|
||||
dev.off()
|
||||
|
||||
if (exists('compactPDF')) {
|
||||
compactPDF(outfile)
|
||||
}
|
||||
|
|
@ -1,84 +0,0 @@
|
|||
#!/bin/env Rscript
|
||||
|
||||
library(tools)
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
|
||||
input = args[1]
|
||||
Qcutoff = as.numeric(args[2])
|
||||
maxQ = as.numeric(args[3])
|
||||
maxHist = as.numeric(args[4])
|
||||
|
||||
t=read.table(input, header=T)
|
||||
|
||||
#
|
||||
# Plot of reported quality versus empirical quality
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_emp_v_stated.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
d.good <- t[t$nBases >= 10000 & t$Qreported >= Qcutoff,]
|
||||
d.1000 <- t[t$nBases < 1000 & t$Qreported >= Qcutoff,]
|
||||
d.10000 <- t[t$nBases < 10000 & t$nBases >= 1000 & t$Qreported >= Qcutoff,]
|
||||
f <- t[t$Qreported < Qcutoff,]
|
||||
e <- rbind(d.good, d.1000, d.10000)
|
||||
rmseGood = sqrt( sum(as.numeric((d.good$Qempirical-d.good$Qreported)^2 * d.good$nBases)) / sum(as.numeric(d.good$nBases)) ) # prevent integer overflow with as.numeric, ugh
|
||||
rmseAll = sqrt( sum(as.numeric((e$Qempirical-e$Qreported)^2 * e$nBases)) / sum(as.numeric(e$nBases)) )
|
||||
theTitle = paste("RMSE_good =", round(rmseGood,digits=3), ", RMSE_all =", round(rmseAll,digits=3))
|
||||
if( length(t$nBases) - length(f$nBases) == length(d.good$nBases) ) {
|
||||
theTitle = paste("RMSE =", round(rmseAll,digits=3));
|
||||
}
|
||||
plot(d.good$Qreported, d.good$Qempirical, type="p", col="blue", main=theTitle, xlim=c(0,maxQ), ylim=c(0,maxQ), pch=16, xlab="Reported quality score", ylab="Empirical quality score")
|
||||
points(d.1000$Qreported, d.1000$Qempirical, type="p", col="lightblue", pch=16)
|
||||
points(d.10000$Qreported, d.10000$Qempirical, type="p", col="cornflowerblue", pch=16)
|
||||
points(f$Qreported, f$Qempirical, type="p", col="maroon1", pch=16)
|
||||
abline(0,1, lty=2)
|
||||
dev.off()
|
||||
|
||||
if (exists('compactPDF')) {
|
||||
compactPDF(outfile)
|
||||
}
|
||||
|
||||
#
|
||||
# Plot Q empirical histogram
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_emp_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Qempirical, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Qempirical, f$nBases), f.nBases != 0)
|
||||
percentBases=hst$e.nBases / sum(as.numeric(hst$e.nBases))
|
||||
entropy = -sum(log2(percentBases)*percentBases)
|
||||
yMax = max(hst$e.nBases)
|
||||
if(maxHist != 0) {
|
||||
yMax = maxHist
|
||||
}
|
||||
plot(hst$e.Qempirical, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Empirical quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Empirical quality score", ylab="Number of Bases",yaxt="n")
|
||||
points(hst2$f.Qempirical, hst2$f.nBases, type="h", lwd=4, col="maroon1")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
||||
if (exists('compactPDF')) {
|
||||
compactPDF(outfile)
|
||||
}
|
||||
|
||||
#
|
||||
# Plot Q reported histogram
|
||||
#
|
||||
|
||||
outfile = paste(input, ".quality_rep_hist.pdf", sep="")
|
||||
pdf(outfile, height=7, width=7)
|
||||
hst=subset(data.frame(e$Qreported, e$nBases), e.nBases != 0)
|
||||
hst2=subset(data.frame(f$Qreported, f$nBases), f.nBases != 0)
|
||||
yMax = max(hst$e.nBases)
|
||||
if(maxHist != 0) {
|
||||
yMax = maxHist
|
||||
}
|
||||
plot(hst$e.Qreported, hst$e.nBases, type="h", lwd=4, xlim=c(0,maxQ), ylim=c(0,yMax), main=paste("Reported quality score histogram, entropy = ",round(entropy,digits=3)), xlab="Reported quality score", ylab="Number of Bases",yaxt="n")
|
||||
points(hst2$f.Qreported, hst2$f.nBases, type="h", lwd=4, col="maroon1")
|
||||
axis(2,axTicks(2), format(axTicks(2), scientific=F))
|
||||
dev.off()
|
||||
|
||||
if (exists('compactPDF')) {
|
||||
compactPDF(outfile)
|
||||
}
|
||||
|
|
@ -0,0 +1,100 @@
|
|||
library("ggplot2")
|
||||
|
||||
args <- commandArgs(TRUE)
|
||||
data <- read.csv(args[1])
|
||||
data <- within(data, EventType <- factor(EventType, levels = rev(levels(EventType))))
|
||||
|
||||
numRG = length(unique(data$ReadGroup))
|
||||
blankTheme = opts(panel.grid.major = theme_blank(), panel.grid.minor = theme_blank(), panel.background = theme_blank(), axis.ticks = theme_blank())
|
||||
|
||||
# Viewport (layout 2 graphs top to bottom)
|
||||
distributeGraphRows <- function(graphs, heights = c()) {
|
||||
if (length(heights) == 0) {
|
||||
heights <- rep.int(1, length(graphs))
|
||||
}
|
||||
heights <- heights[!is.na(graphs)]
|
||||
graphs <- graphs[!is.na(graphs)]
|
||||
numGraphs <- length(graphs)
|
||||
Layout <- grid.layout(nrow = numGraphs, ncol = 1, heights=heights)
|
||||
grid.newpage()
|
||||
pushViewport(viewport(layout = Layout))
|
||||
subplot <- function(x) viewport(layout.pos.row = x, layout.pos.col = 1)
|
||||
for (i in 1:numGraphs) {
|
||||
print(graphs[[i]], vp = subplot(i))
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
for(cov in levels(data$CovariateName)) { # for each covariate in turn
|
||||
d = data[data$CovariateName==cov,] # pull out just the data for this covariate so we can treat the non-numeric values appropriately
|
||||
if( cov == "Context" ) {
|
||||
d$CovariateValue = as.character(d$CovariateValue)
|
||||
d$CovariateValue = substring(d$CovariateValue,nchar(d$CovariateValue)-2,nchar(d$CovariateValue))
|
||||
} else {
|
||||
d$CovariateValue = as.numeric(levels(d$CovariateValue))[as.integer(d$CovariateValue)] # efficient way to convert factors back to their real values
|
||||
}
|
||||
#d=subset(d,Observations>2000) # only show bins which have enough data to actually estimate the quality
|
||||
dSub=subset(d,EventType=="Base Substitution")
|
||||
dIns=subset(d,EventType=="Base Insertion")
|
||||
dDel=subset(d,EventType=="Base Deletion")
|
||||
dSub=dSub[sample.int(length(dSub[,1]),min(length(dSub[,1]),2000)),] # don't plot too many values because it makes the PDFs too massive
|
||||
dIns=dIns[sample.int(length(dIns[,1]),min(length(dIns[,1]),2000)),] # don't plot too many values because it makes the PDFs too massive
|
||||
dDel=dDel[sample.int(length(dDel[,1]),min(length(dDel[,1]),2000)),] # don't plot too many values because it makes the PDFs too massive
|
||||
d=rbind(dSub, dIns, dDel)
|
||||
|
||||
if( cov != "QualityScore" ) {
|
||||
p <- ggplot(d, aes(x=CovariateValue,y=Accuracy,alpha=log10(Observations))) +
|
||||
geom_abline(intercept=0, slope=0, linetype=2) +
|
||||
xlab(paste(cov,"Covariate")) +
|
||||
ylab("Quality Score Accuracy") +
|
||||
blankTheme
|
||||
if(cov == "Cycle") {
|
||||
b <- p + geom_point(aes(color=Recalibration)) + scale_color_manual(values=c("maroon1","blue")) + facet_grid(.~EventType) +
|
||||
opts(axis.text.x=theme_text(angle=90, hjust=0))
|
||||
|
||||
p <- ggplot(d, aes(x=CovariateValue,y=AverageReportedQuality,alpha=log10(Observations))) +
|
||||
xlab(paste(cov,"Covariate")) +
|
||||
ylab("Mean Quality Score") +
|
||||
blankTheme
|
||||
e <- p + geom_point(aes(color=Recalibration)) + scale_color_manual(values=c("maroon1","blue")) + facet_grid(.~EventType) +
|
||||
opts(axis.text.x=theme_text(angle=90, hjust=0))
|
||||
|
||||
|
||||
} else {
|
||||
c <- p + geom_point(aes(color=Recalibration)) + scale_color_manual(values=c("maroon1","blue")) + facet_grid(.~EventType) +
|
||||
opts(axis.text.x=theme_text(angle=90, hjust=0)) + xlab(paste(cov,"Covariate (3 base suffix)"))
|
||||
p <- ggplot(d, aes(x=CovariateValue,y=AverageReportedQuality,alpha=log10(Observations))) +
|
||||
xlab(paste(cov,"Covariate (3 base suffix)")) +
|
||||
ylab("Mean Quality Score") +
|
||||
blankTheme
|
||||
f <- p + geom_point(aes(color=Recalibration)) + scale_color_manual(values=c("maroon1","blue")) + facet_grid(.~EventType) +
|
||||
opts(axis.text.x=theme_text(angle=90, hjust=0))
|
||||
|
||||
}
|
||||
} else {
|
||||
p <- ggplot(d, aes(x=AverageReportedQuality,y=EmpiricalQuality,alpha=log10(Observations))) +
|
||||
geom_abline(intercept=0, slope=1, linetype=2) +
|
||||
xlab("Reported Quality Score") +
|
||||
ylab("Empirical Quality Score") +
|
||||
blankTheme
|
||||
a <- p + geom_point(aes(color=Recalibration)) + scale_color_manual(values=c("maroon1","blue")) + facet_grid(.~EventType)
|
||||
|
||||
p <- ggplot(d, aes(x=CovariateValue)) +
|
||||
xlab(paste(cov,"Covariate")) +
|
||||
ylab("Number of Observations") +
|
||||
blankTheme
|
||||
d <- p + geom_histogram(aes(fill=Recalibration,weight=Observations),alpha=0.6,binwidth=1,position="identity") + scale_fill_manual(values=c("maroon1","blue")) + facet_grid(.~EventType) +
|
||||
scale_y_continuous(formatter="comma")
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
pdf(args[2],height=9,width=15)
|
||||
distributeGraphRows(list(a,b,c), c(1,1,1))
|
||||
distributeGraphRows(list(d,e,f), c(1,1,1))
|
||||
dev.off()
|
||||
|
||||
|
||||
if (exists('compactPDF')) {
|
||||
compactPDF(args[2])
|
||||
}
|
||||
|
|
@ -2,6 +2,7 @@ library(gsalib)
|
|||
library(ggplot2)
|
||||
library(gplots)
|
||||
library(tools)
|
||||
library(reshape)
|
||||
|
||||
#
|
||||
# Standard command line switch. Can we loaded interactively for development
|
||||
|
|
|
|||
|
|
@ -19,9 +19,9 @@ Medical and Population Genetics Program
|
|||
Maintainer: Kiran Garimella
|
||||
}
|
||||
\references{
|
||||
GSA wiki page: http://www.broadinstitute.org/gsa/wiki
|
||||
GSA wiki page: http://www.broadinstitute.org/gatk
|
||||
|
||||
GATK help forum: http://www.getsatisfaction.com/gsa
|
||||
GATK help forum: http://www.broadinstitute.org/gatk
|
||||
}
|
||||
\examples{
|
||||
## get script arguments in interactive and non-interactive mode
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ import java.util.Iterator;
|
|||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class AlignmentValidationWalker extends ReadWalker<Integer,Integer> {
|
||||
public class AlignmentValidation extends ReadWalker<Integer,Integer> {
|
||||
/**
|
||||
* The supporting BWT index generated using BWT.
|
||||
*/
|
||||
|
|
@ -48,7 +48,7 @@ import java.util.TreeMap;
|
|||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class CountBestAlignmentsWalker extends ReadWalker<Integer,Integer> {
|
||||
public class CountBestAlignments extends ReadWalker<Integer,Integer> {
|
||||
/**
|
||||
* The supporting BWT index generated using BWT.
|
||||
*/
|
||||
|
|
@ -1,113 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.analyzecovariates;
|
||||
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
|
||||
import org.broadinstitute.sting.utils.collections.NestedHashMap;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: rpoplin
|
||||
* Date: Dec 1, 2009
|
||||
*
|
||||
* The difference between this AnalysisDataManager and the RecalDataManager used by the Recalibration walkers is that here the collapsed data tables are indexed
|
||||
* by only read group and the given covariate, while in the recalibrator the collapsed tables are indexed by read group, reported quality, and the given covariate.
|
||||
*/
|
||||
|
||||
public class AnalysisDataManager {
|
||||
|
||||
private NestedHashMap dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
private ArrayList<NestedHashMap> dataCollapsedByCovariate; // Tables where everything except read group and given covariate has been collapsed
|
||||
|
||||
AnalysisDataManager() {
|
||||
}
|
||||
|
||||
AnalysisDataManager( final int numCovariates ) {
|
||||
dataCollapsedReadGroup = new NestedHashMap();
|
||||
dataCollapsedByCovariate = new ArrayList<NestedHashMap>();
|
||||
for( int iii = 0; iii < numCovariates - 1; iii++ ) { // readGroup isn't counted here, its table is separate
|
||||
dataCollapsedByCovariate.add( new NestedHashMap() );
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the given mapping to all of the collapsed hash tables
|
||||
* @param key The list of comparables that is the key for this mapping
|
||||
* @param fullDatum The RecalDatum which is the data for this mapping
|
||||
* @param IGNORE_QSCORES_LESS_THAN The threshold in report quality for adding to the aggregate collapsed table
|
||||
*/
|
||||
public final void addToAllTables( final Object[] key, final RecalDatum fullDatum, final int IGNORE_QSCORES_LESS_THAN ) {
|
||||
|
||||
int qscore = Integer.parseInt( key[1].toString() );
|
||||
RecalDatum collapsedDatum;
|
||||
final Object[] readGroupCollapsedKey = new Object[1];
|
||||
final Object[] covariateCollapsedKey = new Object[2];
|
||||
|
||||
if( !(qscore < IGNORE_QSCORES_LESS_THAN) ) {
|
||||
// Create dataCollapsedReadGroup, the table where everything except read group has been collapsed
|
||||
readGroupCollapsedKey[0] = key[0]; // Make a new key with just the read group
|
||||
collapsedDatum = (RecalDatum)dataCollapsedReadGroup.get( readGroupCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedReadGroup.put( new RecalDatum(fullDatum), readGroupCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.combine( fullDatum ); // using combine instead of increment in order to calculate overall aggregateQReported
|
||||
}
|
||||
}
|
||||
|
||||
// Create dataCollapsedByCovariate's, the tables where everything except read group and given covariate has been collapsed
|
||||
for( int iii = 0; iii < dataCollapsedByCovariate.size(); iii++ ) {
|
||||
if( iii == 0 || !(qscore < IGNORE_QSCORES_LESS_THAN) ) { // use all data for the plot versus reported quality, but not for the other plots versus cycle and etc.
|
||||
covariateCollapsedKey[0] = key[0]; // Make a new key with the read group ...
|
||||
Object theCovariateElement = key[iii + 1]; // and the given covariate
|
||||
if( theCovariateElement != null ) {
|
||||
covariateCollapsedKey[1] = theCovariateElement;
|
||||
collapsedDatum = (RecalDatum)dataCollapsedByCovariate.get(iii).get( covariateCollapsedKey );
|
||||
if( collapsedDatum == null ) {
|
||||
dataCollapsedByCovariate.get(iii).put( new RecalDatum(fullDatum), covariateCollapsedKey );
|
||||
} else {
|
||||
collapsedDatum.combine( fullDatum );
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the appropriate collapsed table out of the set of all the tables held by this Object
|
||||
* @param covariate Which covariate indexes the desired collapsed HashMap
|
||||
* @return The desired collapsed HashMap
|
||||
*/
|
||||
public final NestedHashMap getCollapsedTable( final int covariate ) {
|
||||
if( covariate == 0) {
|
||||
return dataCollapsedReadGroup; // Table where everything except read group has been collapsed
|
||||
} else {
|
||||
return dataCollapsedByCovariate.get( covariate - 1 ); // Table where everything except read group, quality score, and given covariate has been collapsed
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -1,383 +0,0 @@
|
|||
/*
|
||||
* Copyright (c) 2010 The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.analyzecovariates;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broadinstitute.sting.commandline.Argument;
|
||||
import org.broadinstitute.sting.commandline.Hidden;
|
||||
import org.broadinstitute.sting.commandline.CommandLineProgram;
|
||||
import org.broadinstitute.sting.commandline.Input;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.Covariate;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalDatum;
|
||||
import org.broadinstitute.sting.gatk.walkers.recalibration.RecalibrationArgumentCollection;
|
||||
import org.broadinstitute.sting.utils.R.RScriptExecutor;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.exceptions.DynamicClassResolutionException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.io.Resource;
|
||||
import org.broadinstitute.sting.utils.text.XReadLines;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
/**
|
||||
* Call R scripts to plot residual error versus the various covariates.
|
||||
*
|
||||
* <p>
|
||||
* After counting covariates in either the initial BAM File or again in the recalibrated BAM File, an analysis tool is available which
|
||||
* reads the .csv file and outputs several PDF (and .dat) files for each read group in the given BAM. These PDF files graphically
|
||||
* show the various metrics and characteristics of the reported quality scores (often in relation to the empirical qualities).
|
||||
* In order to show that any biases in the reported quality scores have been generally fixed through recalibration one should run
|
||||
* CountCovariates again on a bam file produced by TableRecalibration. In this way users can compare the analysis plots generated
|
||||
* by pre-recalibration and post-recalibration .csv files. Our usual chain of commands that we use to generate plots of residual
|
||||
* error is: CountCovariates, TableRecalibrate, samtools index on the recalibrated bam file, CountCovariates again on the recalibrated
|
||||
* bam file, and then AnalyzeCovariates on both the before and after recal_data.csv files to see the improvement in recalibration.
|
||||
*
|
||||
* <p>
|
||||
* The color coding along with the RMSE is included in the plots to give some indication of the number of observations that went into
|
||||
* each of the quality score estimates. It is defined as follows for N, the number of observations:
|
||||
*
|
||||
* <ul>
|
||||
* <li>light blue means N < 1,000</li>
|
||||
* <li>cornflower blue means 1,000 <= N < 10,000</li>
|
||||
* <li>dark blue means N >= 10,000</li>
|
||||
* <li>The pink dots indicate points whose quality scores are special codes used by the aligner and which are mathematically
|
||||
* meaningless and so aren't included in any of the numerical calculations.</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>
|
||||
* NOTE: Rscript needs to be in your environment PATH (this is the scripting version of R, not the interactive version).
|
||||
* See <a target="r-project" href="http://www.r-project.org">http://www.r-project.org</a> for more info on how to download and install R.
|
||||
*
|
||||
* <p>
|
||||
* See the GATK wiki for a tutorial and example recalibration accuracy plots.
|
||||
* <a target="gatkwiki" href="http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration"
|
||||
* >http://www.broadinstitute.org/gsa/wiki/index.php/Base_quality_score_recalibration</a>
|
||||
*
|
||||
* <h2>Input</h2>
|
||||
* <p>
|
||||
* The recalibration table file in CSV format that was generated by the CountCovariates walker.
|
||||
* </p>
|
||||
*
|
||||
* <h2>Examples</h2>
|
||||
* <pre>
|
||||
* java -Xmx4g -jar AnalyzeCovariates.jar \
|
||||
* -recalFile /path/to/recal.table.csv \
|
||||
* -outputDir /path/to/output_dir/ \
|
||||
* -ignoreQ 5
|
||||
* </pre>
|
||||
*
|
||||
*/
|
||||
|
||||
@DocumentedGATKFeature(
|
||||
groupName = "AnalyzeCovariates",
|
||||
summary = "Package to plot residual accuracy versus error covariates for the base quality score recalibrator")
|
||||
public class AnalyzeCovariates extends CommandLineProgram {
|
||||
final private static Logger logger = Logger.getLogger(AnalyzeCovariates.class);
|
||||
|
||||
private static final String PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE = "plot_residualError_QualityScoreCovariate.R";
|
||||
private static final String PLOT_RESDIUAL_ERROR_OTHER_COVARIATE = "plot_residualError_OtherCovariate.R";
|
||||
private static final String PLOT_INDEL_QUALITY_RSCRIPT = "plot_indelQuality.R";
|
||||
|
||||
/////////////////////////////
|
||||
// Command Line Arguments
|
||||
/////////////////////////////
|
||||
/**
|
||||
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
|
||||
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
|
||||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
*/
|
||||
@Input(fullName = "recal_file", shortName = "recalFile", doc = "The input recal csv file to analyze", required = false)
|
||||
private String RECAL_FILE = "output.recal_data.csv";
|
||||
@Argument(fullName = "output_dir", shortName = "outputDir", doc = "The directory in which to output all the plots and intermediate data files", required = false)
|
||||
private File OUTPUT_DIR = new File("analyzeCovariates");
|
||||
@Argument(fullName = "ignoreQ", shortName = "ignoreQ", doc = "Ignore bases with reported quality less than this number.", required = false)
|
||||
private int IGNORE_QSCORES_LESS_THAN = 5;
|
||||
@Argument(fullName = "numRG", shortName = "numRG", doc = "Only process N read groups. Default value: -1 (process all read groups)", required = false)
|
||||
private int NUM_READ_GROUPS_TO_PROCESS = -1; // -1 means process all read groups
|
||||
|
||||
/**
|
||||
* Combinations of covariates in which there are zero mismatches technically have infinite quality. We get around this situation
|
||||
* by capping at the specified value. We've found that Q40 is too low when using a more completely database of known variation like dbSNP build 132 or later.
|
||||
*/
|
||||
@Argument(fullName="max_quality_score", shortName="maxQ", required = false, doc="The integer value at which to cap the quality scores, default is 50")
|
||||
private int MAX_QUALITY_SCORE = 50;
|
||||
|
||||
/**
|
||||
* This argument is useful for comparing before/after plots and you want the axes to match each other.
|
||||
*/
|
||||
@Argument(fullName="max_histogram_value", shortName="maxHist", required = false, doc="If supplied, this value will be the max value of the histogram plots")
|
||||
private int MAX_HISTOGRAM_VALUE = 0;
|
||||
|
||||
@Hidden
|
||||
@Argument(fullName="do_indel_quality", shortName="indels", required = false, doc="If supplied, do indel quality plotting")
|
||||
private boolean DO_INDEL_QUALITY = false;
|
||||
|
||||
/////////////////////////////
|
||||
// Private Member Variables
|
||||
/////////////////////////////
|
||||
private AnalysisDataManager dataManager; // Holds the data HashMap, mostly used by TableRecalibrationWalker to create collapsed data hashmaps
|
||||
private ArrayList<Covariate> requestedCovariates; // List of covariates to be used in this calculation
|
||||
private final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
private final Pattern OLD_RECALIBRATOR_HEADER = Pattern.compile("^rg,.*");
|
||||
private final Pattern COVARIATE_PATTERN = Pattern.compile("^ReadGroup,QualityScore,.*");
|
||||
protected static final String EOF_MARKER = "EOF";
|
||||
|
||||
protected int execute() {
|
||||
|
||||
// create the output directory where all the data tables and plots will go
|
||||
if (!OUTPUT_DIR.exists() && !OUTPUT_DIR.mkdirs())
|
||||
throw new UserException.BadArgumentValue("--output_dir/-outDir", "Unable to create output directory: " + OUTPUT_DIR);
|
||||
|
||||
if (!RScriptExecutor.RSCRIPT_EXISTS)
|
||||
Utils.warnUser(logger, "Rscript not found in environment path. Plots will not be generated.");
|
||||
|
||||
// initialize all the data from the csv file and allocate the list of covariates
|
||||
logger.info("Reading in input csv file...");
|
||||
initializeData();
|
||||
logger.info("...Done!");
|
||||
|
||||
// output data tables for Rscript to read in
|
||||
logger.info("Writing out intermediate tables for R...");
|
||||
writeDataTables();
|
||||
logger.info("...Done!");
|
||||
|
||||
// perform the analysis using Rscript and output the plots
|
||||
logger.info("Calling analysis R scripts and writing out figures...");
|
||||
callRScripts();
|
||||
logger.info("...Done!");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
private void initializeData() {
|
||||
|
||||
// Get a list of all available covariates
|
||||
Collection<Class<? extends Covariate>> classes = new PluginManager<Covariate>(Covariate.class).getPlugins();
|
||||
|
||||
int lineNumber = 0;
|
||||
boolean foundAllCovariates = false;
|
||||
|
||||
// Read in the covariates that were used from the input file
|
||||
requestedCovariates = new ArrayList<Covariate>();
|
||||
|
||||
try {
|
||||
for ( String line : new XReadLines(new File( RECAL_FILE )) ) {
|
||||
lineNumber++;
|
||||
if( COMMENT_PATTERN.matcher(line).matches() || OLD_RECALIBRATOR_HEADER.matcher(line).matches() || line.equals(EOF_MARKER) ) {
|
||||
; // Skip over the comment lines, (which start with '#')
|
||||
}
|
||||
else if( COVARIATE_PATTERN.matcher(line).matches() ) { // The line string is either specifying a covariate or is giving csv data
|
||||
if( foundAllCovariates ) {
|
||||
throw new RuntimeException( "Malformed input recalibration file. Found covariate names intermingled with data in file: " + RECAL_FILE );
|
||||
} else { // Found the covariate list in input file, loop through all of them and instantiate them
|
||||
String[] vals = line.split(",");
|
||||
for( int iii = 0; iii < vals.length - 3; iii++ ) { // There are n-3 covariates. The last three items are nObservations, nMismatch, and Qempirical
|
||||
boolean foundClass = false;
|
||||
for( Class<?> covClass : classes ) {
|
||||
if( (vals[iii] + "Covariate").equalsIgnoreCase( covClass.getSimpleName() ) ) {
|
||||
foundClass = true;
|
||||
try {
|
||||
Covariate covariate = (Covariate)covClass.newInstance();
|
||||
requestedCovariates.add( covariate );
|
||||
} catch (Exception e) {
|
||||
throw new DynamicClassResolutionException(covClass, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if( !foundClass ) {
|
||||
throw new RuntimeException( "Malformed input recalibration file. The requested covariate type (" + (vals[iii] + "Covariate") + ") isn't a valid covariate option." );
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
} else { // Found a line of data
|
||||
if( !foundAllCovariates ) {
|
||||
|
||||
foundAllCovariates = true;
|
||||
|
||||
// At this point all the covariates should have been found and initialized
|
||||
if( requestedCovariates.size() < 2 ) {
|
||||
throw new RuntimeException( "Malformed input recalibration file. Covariate names can't be found in file: " + RECAL_FILE );
|
||||
}
|
||||
|
||||
// Initialize any covariate member variables using the shared argument collection
|
||||
for( Covariate cov : requestedCovariates ) {
|
||||
cov.initialize( new RecalibrationArgumentCollection() );
|
||||
}
|
||||
|
||||
// Initialize the data hashMaps
|
||||
dataManager = new AnalysisDataManager( requestedCovariates.size() );
|
||||
|
||||
}
|
||||
addCSVData(line); // Parse the line and add the data to the HashMap
|
||||
}
|
||||
}
|
||||
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException("Can not find input file: " + RECAL_FILE);
|
||||
} catch ( NumberFormatException e ) {
|
||||
throw new RuntimeException("Error parsing recalibration data at line " + lineNumber + ". Perhaps your table was generated by an older version of CovariateCounterWalker.");
|
||||
}
|
||||
}
|
||||
|
||||
private void addCSVData(String line) {
|
||||
String[] vals = line.split(",");
|
||||
|
||||
// Check if the data line is malformed, for example if the read group string contains a comma then it won't be parsed correctly
|
||||
if( vals.length != requestedCovariates.size() + 3 ) { // +3 because of nObservations, nMismatch, and Qempirical
|
||||
throw new RuntimeException("Malformed input recalibration file. Found data line with too many fields: " + line +
|
||||
" --Perhaps the read group string contains a comma and isn't being parsed correctly.");
|
||||
}
|
||||
|
||||
Object[] key = new Object[requestedCovariates.size()];
|
||||
Covariate cov;
|
||||
int iii;
|
||||
for( iii = 0; iii < requestedCovariates.size(); iii++ ) {
|
||||
cov = requestedCovariates.get( iii );
|
||||
key[iii] = cov.getValue( vals[iii] );
|
||||
}
|
||||
// Create a new datum using the number of observations, number of mismatches, and reported quality score
|
||||
RecalDatum datum = new RecalDatum( Long.parseLong( vals[iii] ), Long.parseLong( vals[iii + 1] ), Double.parseDouble( vals[1] ), 0.0 );
|
||||
// Add that datum to all the collapsed tables which will be used in the sequential calculation
|
||||
dataManager.addToAllTables( key, datum, IGNORE_QSCORES_LESS_THAN );
|
||||
}
|
||||
|
||||
private void writeDataTables() {
|
||||
|
||||
int numReadGroups = 0;
|
||||
|
||||
// for each read group
|
||||
for( Object readGroupKey : dataManager.getCollapsedTable(0).data.keySet() ) {
|
||||
|
||||
if(NUM_READ_GROUPS_TO_PROCESS == -1 || ++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS) {
|
||||
String readGroup = readGroupKey.toString();
|
||||
RecalDatum readGroupDatum = (RecalDatum) dataManager.getCollapsedTable(0).data.get(readGroupKey);
|
||||
logger.info(String.format(
|
||||
"Writing out data tables for read group: %s\twith %s observations\tand aggregate residual error = %.3f",
|
||||
readGroup, readGroupDatum.getNumObservations(),
|
||||
readGroupDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE) - readGroupDatum.getEstimatedQReported()));
|
||||
|
||||
// for each covariate
|
||||
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
|
||||
Covariate cov = requestedCovariates.get(iii);
|
||||
|
||||
// Create a PrintStream
|
||||
File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
|
||||
PrintStream output;
|
||||
try {
|
||||
output = new PrintStream(FileUtils.openOutputStream(outputFile));
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotCreateOutputFile(outputFile, e);
|
||||
}
|
||||
|
||||
try {
|
||||
// Output the header
|
||||
output.println("Covariate\tQreported\tQempirical\tnMismatches\tnBases");
|
||||
|
||||
for( Object covariateKey : ((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).keySet()) {
|
||||
output.print( covariateKey.toString() + "\t" ); // Covariate
|
||||
RecalDatum thisDatum = (RecalDatum)((Map)dataManager.getCollapsedTable(iii).data.get(readGroupKey)).get(covariateKey);
|
||||
output.print( String.format("%.3f", thisDatum.getEstimatedQReported()) + "\t" ); // Qreported
|
||||
output.print( String.format("%.3f", thisDatum.empiricalQualDouble(0, MAX_QUALITY_SCORE)) + "\t" ); // Qempirical
|
||||
output.print( thisDatum.getNumMismatches() + "\t" ); // nMismatches
|
||||
output.println( thisDatum.getNumObservations() ); // nBases
|
||||
}
|
||||
} finally {
|
||||
// Close the PrintStream
|
||||
IOUtils.closeQuietly(output);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
private void callRScripts() {
|
||||
int numReadGroups = 0;
|
||||
|
||||
// for each read group
|
||||
for( Object readGroupKey : dataManager.getCollapsedTable(0).data.keySet() ) {
|
||||
if(++numReadGroups <= NUM_READ_GROUPS_TO_PROCESS || NUM_READ_GROUPS_TO_PROCESS == -1) {
|
||||
|
||||
String readGroup = readGroupKey.toString();
|
||||
logger.info("Analyzing read group: " + readGroup);
|
||||
|
||||
// for each covariate
|
||||
for( int iii = 1; iii < requestedCovariates.size(); iii++ ) {
|
||||
final Covariate cov = requestedCovariates.get(iii);
|
||||
final File outputFile = new File(OUTPUT_DIR, readGroup + "." + cov.getClass().getSimpleName()+ ".dat");
|
||||
if (DO_INDEL_QUALITY) {
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.addScript(new Resource(PLOT_INDEL_QUALITY_RSCRIPT, AnalyzeCovariates.class));
|
||||
// The second argument is the name of the covariate in order to make the plots look nice
|
||||
executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
|
||||
executor.exec();
|
||||
} else {
|
||||
if( iii == 1 ) {
|
||||
// Analyze reported quality
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_QUALITY_SCORE_COVARIATE, AnalyzeCovariates.class));
|
||||
// The second argument is the Q scores that should be turned pink in the plot because they were ignored
|
||||
executor.addArgs(outputFile, IGNORE_QSCORES_LESS_THAN, MAX_QUALITY_SCORE, MAX_HISTOGRAM_VALUE);
|
||||
executor.exec();
|
||||
} else { // Analyze all other covariates
|
||||
RScriptExecutor executor = new RScriptExecutor();
|
||||
executor.addScript(new Resource(PLOT_RESDIUAL_ERROR_OTHER_COVARIATE, AnalyzeCovariates.class));
|
||||
// The second argument is the name of the covariate in order to make the plots look nice
|
||||
executor.addArgs(outputFile, cov.getClass().getSimpleName().split("Covariate")[0]);
|
||||
executor.exec();
|
||||
}
|
||||
}
|
||||
}
|
||||
} else { // at the maximum number of read groups so break out
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String args[]) {
|
||||
try {
|
||||
AnalyzeCovariates clp = new AnalyzeCovariates();
|
||||
start(clp, args);
|
||||
System.exit(CommandLineProgram.result);
|
||||
} catch (Exception e) {
|
||||
exitSystemWithError(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
/**
|
||||
* Package to plot residual accuracy versus error covariates for the base quality score recalibrator.
|
||||
*/
|
||||
package org.broadinstitute.sting.analyzecovariates;
|
||||
|
|
@ -289,7 +289,7 @@ public abstract class ArgumentTypeDescriptor {
|
|||
return field.isAnnotationPresent(Hidden.class);
|
||||
}
|
||||
|
||||
public Class makeRawTypeIfNecessary(Type t) {
|
||||
public static Class makeRawTypeIfNecessary(Type t) {
|
||||
if ( t == null )
|
||||
return null;
|
||||
else if ( t instanceof ParameterizedType )
|
||||
|
|
@ -300,6 +300,115 @@ public abstract class ArgumentTypeDescriptor {
|
|||
throw new IllegalArgumentException("Unable to determine Class-derived component type of field: " + t);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The actual argument parsing method.
|
||||
* @param source source
|
||||
* @param type type to check
|
||||
* @param matches matches
|
||||
* @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding.
|
||||
*/
|
||||
protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) {
|
||||
ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
|
||||
String value = getArgumentValue(defaultDefinition, matches);
|
||||
@SuppressWarnings("unchecked")
|
||||
Class<? extends Feature> parameterType = JVMUtils.getParameterizedTypeClass(type);
|
||||
String name = defaultDefinition.fullName;
|
||||
|
||||
return parseBinding(value, parameterType, type, name, tags, source.field.getName());
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param value The source of the binding
|
||||
* @param parameterType The Tribble Feature parameter type
|
||||
* @param bindingClass The class type for the binding (ex: RodBinding, IntervalBinding, etc.) Must have the correct constructor for creating the binding.
|
||||
* @param bindingName The name of the binding passed to the constructor.
|
||||
* @param tags Tags for the binding used for parsing and passed to the constructor.
|
||||
* @param fieldName The name of the field that was parsed. Used for error reporting.
|
||||
* @return The newly created binding object of type bindingClass.
|
||||
*/
|
||||
public static Object parseBinding(String value, Class<? extends Feature> parameterType, Type bindingClass,
|
||||
String bindingName, Tags tags, String fieldName) {
|
||||
try {
|
||||
String tribbleType = null;
|
||||
// must have one or two tag values here
|
||||
if ( tags.getPositionalTags().size() > 2 ) {
|
||||
throw new UserException.CommandLineException(
|
||||
String.format("Unexpected number of positional tags for argument %s : %s. " +
|
||||
"Rod bindings only support -X:type and -X:name,type argument styles",
|
||||
value, fieldName));
|
||||
} else if ( tags.getPositionalTags().size() == 2 ) {
|
||||
// -X:name,type style
|
||||
bindingName = tags.getPositionalTags().get(0);
|
||||
tribbleType = tags.getPositionalTags().get(1);
|
||||
|
||||
FeatureManager manager = new FeatureManager();
|
||||
if ( manager.getByName(tribbleType) == null )
|
||||
throw new UserException.UnknownTribbleType(
|
||||
tribbleType,
|
||||
String.format("Unable to find tribble type '%s' provided on the command line. " +
|
||||
"Please select a correct type from among the supported types:%n%s",
|
||||
tribbleType, manager.userFriendlyListOfAvailableFeatures(parameterType)));
|
||||
|
||||
} else {
|
||||
// case with 0 or 1 positional tags
|
||||
FeatureManager manager = new FeatureManager();
|
||||
|
||||
// -X:type style is a type when we cannot determine the type dynamically
|
||||
String tag1 = tags.getPositionalTags().size() == 1 ? tags.getPositionalTags().get(0) : null;
|
||||
if ( tag1 != null ) {
|
||||
if ( manager.getByName(tag1) != null ) // this a type
|
||||
tribbleType = tag1;
|
||||
else
|
||||
bindingName = tag1;
|
||||
}
|
||||
|
||||
if ( tribbleType == null ) {
|
||||
// try to determine the file type dynamically
|
||||
File file = new File(value);
|
||||
if ( file.canRead() && file.isFile() ) {
|
||||
FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file);
|
||||
if ( featureDescriptor != null ) {
|
||||
tribbleType = featureDescriptor.getName();
|
||||
logger.info("Dynamically determined type of " + file + " to be " + tribbleType);
|
||||
}
|
||||
}
|
||||
|
||||
if ( tribbleType == null ) {
|
||||
// IntervalBinding can be created from a normal String
|
||||
Class rawType = (makeRawTypeIfNecessary(bindingClass));
|
||||
try {
|
||||
return rawType.getConstructor(String.class).newInstance(value);
|
||||
} catch (NoSuchMethodException e) {
|
||||
/* ignore */
|
||||
}
|
||||
|
||||
if ( ! file.exists() ) {
|
||||
throw new UserException.CouldNotReadInputFile(file, "file does not exist");
|
||||
} else if ( ! file.canRead() || ! file.isFile() ) {
|
||||
throw new UserException.CouldNotReadInputFile(file, "file could not be read");
|
||||
} else {
|
||||
throw new UserException.CommandLineException(
|
||||
String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " +
|
||||
"Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s",
|
||||
manager.userFriendlyListOfAvailableFeatures(parameterType)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class);
|
||||
return ctor.newInstance(parameterType, bindingName, value, tribbleType, tags);
|
||||
} catch (Exception e) {
|
||||
if ( e instanceof UserException )
|
||||
throw ((UserException)e);
|
||||
else
|
||||
throw new UserException.CommandLineException(
|
||||
String.format("Failed to parse value %s for argument %s. Message: %s",
|
||||
value, fieldName, e.getMessage()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -324,6 +433,7 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
public boolean createsTypeDefault(ArgumentSource source) { return ! source.isRequired(); }
|
||||
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public Object createTypeDefault(ParsingEngine parsingEngine, ArgumentSource source, Type type) {
|
||||
Class parameterType = JVMUtils.getParameterizedTypeClass(type);
|
||||
return RodBinding.makeUnbound((Class<? extends Feature>)parameterType);
|
||||
|
|
@ -336,118 +446,16 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
|
||||
@Override
|
||||
public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) {
|
||||
return parse(parsingEngine, source, type, matches, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* The actual argument parsing method.
|
||||
*
|
||||
* IMPORTANT NOTE: the createIntervalBinding argument is a bit of a hack, but after discussions with SE we've decided
|
||||
* that it's the best way to proceed for now. IntervalBindings can either be proper RodBindings (hence the use of
|
||||
* this parse() method) or can be Strings (representing raw intervals or the files containing them). If createIntervalBinding
|
||||
* is true, we do not call parsingEngine.addRodBinding() because we don't want walkers to assume that these are the
|
||||
* usual set of RodBindings. It also allows us in the future to be smart about tagging rods as intervals. One other
|
||||
* side point is that we want to continue to allow the usage of non-Feature intervals so that users can theoretically
|
||||
* continue to input them out of order (whereas Tribble Features are ordered).
|
||||
*
|
||||
* @param parsingEngine parsing engine
|
||||
* @param source source
|
||||
* @param type type to check
|
||||
* @param matches matches
|
||||
* @param createIntervalBinding should we attempt to create an IntervalBinding instead of a RodBinding?
|
||||
* @return the RodBinding/IntervalBinding object depending on the value of createIntervalBinding.
|
||||
*/
|
||||
public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches, boolean createIntervalBinding) {
|
||||
ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
|
||||
String value = getArgumentValue( defaultDefinition, matches );
|
||||
Class<? extends Feature> parameterType = JVMUtils.getParameterizedTypeClass(type);
|
||||
|
||||
try {
|
||||
String name = defaultDefinition.fullName;
|
||||
String tribbleType = null;
|
||||
Tags tags = getArgumentTags(matches);
|
||||
// must have one or two tag values here
|
||||
if ( tags.getPositionalTags().size() > 2 ) {
|
||||
throw new UserException.CommandLineException(
|
||||
String.format("Unexpected number of positional tags for argument %s : %s. " +
|
||||
"Rod bindings only support -X:type and -X:name,type argument styles",
|
||||
value, source.field.getName()));
|
||||
} if ( tags.getPositionalTags().size() == 2 ) {
|
||||
// -X:name,type style
|
||||
name = tags.getPositionalTags().get(0);
|
||||
tribbleType = tags.getPositionalTags().get(1);
|
||||
} else {
|
||||
// case with 0 or 1 positional tags
|
||||
FeatureManager manager = new FeatureManager();
|
||||
|
||||
// -X:type style is a type when we cannot determine the type dynamically
|
||||
String tag1 = tags.getPositionalTags().size() == 1 ? tags.getPositionalTags().get(0) : null;
|
||||
if ( tag1 != null ) {
|
||||
if ( manager.getByName(tag1) != null ) // this a type
|
||||
tribbleType = tag1;
|
||||
else
|
||||
name = tag1;
|
||||
}
|
||||
|
||||
if ( tribbleType == null ) {
|
||||
// try to determine the file type dynamically
|
||||
File file = new File(value);
|
||||
if ( file.canRead() && file.isFile() ) {
|
||||
FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file);
|
||||
if ( featureDescriptor != null ) {
|
||||
tribbleType = featureDescriptor.getName();
|
||||
logger.info("Dynamically determined type of " + file + " to be " + tribbleType);
|
||||
}
|
||||
}
|
||||
|
||||
if ( tribbleType == null ) {
|
||||
// IntervalBindings allow streaming conversion of Strings
|
||||
if ( createIntervalBinding ) {
|
||||
return new IntervalBinding(value);
|
||||
}
|
||||
|
||||
if ( ! file.exists() ) {
|
||||
throw new UserException.CouldNotReadInputFile(file, "file does not exist");
|
||||
} else if ( ! file.canRead() || ! file.isFile() ) {
|
||||
throw new UserException.CouldNotReadInputFile(file, "file could not be read");
|
||||
} else {
|
||||
throw new UserException.CommandLineException(
|
||||
String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " +
|
||||
"Please add an explicit type tag :NAME listing the correct type from among the supported types:%n%s",
|
||||
manager.userFriendlyListOfAvailableFeatures(parameterType)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Constructor ctor = (makeRawTypeIfNecessary(type)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class);
|
||||
Object result;
|
||||
if ( createIntervalBinding ) {
|
||||
result = ctor.newInstance(parameterType, name, value, tribbleType, tags);
|
||||
} else {
|
||||
RodBinding rbind = (RodBinding)ctor.newInstance(parameterType, name, value, tribbleType, tags);
|
||||
parsingEngine.addTags(rbind, tags);
|
||||
parsingEngine.addRodBinding(rbind);
|
||||
result = rbind;
|
||||
}
|
||||
return result;
|
||||
} catch (InvocationTargetException e) {
|
||||
throw new UserException.CommandLineException(
|
||||
String.format("Failed to parse value %s for argument %s.",
|
||||
value, source.field.getName()));
|
||||
} catch (Exception e) {
|
||||
if ( e instanceof UserException )
|
||||
throw ((UserException)e);
|
||||
else
|
||||
throw new UserException.CommandLineException(
|
||||
String.format("Failed to parse value %s for argument %s. Message: %s",
|
||||
value, source.field.getName(), e.getMessage()));
|
||||
}
|
||||
Tags tags = getArgumentTags(matches);
|
||||
RodBinding rbind = (RodBinding)parseBinding(source, type, matches, tags);
|
||||
parsingEngine.addTags(rbind, tags);
|
||||
parsingEngine.addRodBinding(rbind);
|
||||
return rbind;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parser for RodBinding objects
|
||||
* Parser for IntervalBinding objects
|
||||
*/
|
||||
class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
||||
/**
|
||||
|
|
@ -475,7 +483,7 @@ class IntervalBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
*/
|
||||
@Override
|
||||
public Object parse(ParsingEngine parsingEngine, ArgumentSource source, Type type, ArgumentMatches matches) {
|
||||
return new RodBindingArgumentTypeDescriptor().parse(parsingEngine, source, type, matches, true);
|
||||
return parseBinding(source, type, matches, getArgumentTags(matches));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -783,7 +791,7 @@ class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
}
|
||||
|
||||
Class<? extends Multiplexer> multiplexerType = dependentArgument.field.getAnnotation(Multiplex.class).value();
|
||||
Constructor<? extends Multiplexer> multiplexerConstructor = null;
|
||||
Constructor<? extends Multiplexer> multiplexerConstructor;
|
||||
try {
|
||||
multiplexerConstructor = multiplexerType.getConstructor(sourceTypes);
|
||||
multiplexerConstructor.setAccessible(true);
|
||||
|
|
@ -792,7 +800,7 @@ class MultiplexArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
throw new ReviewedStingException(String.format("Unable to find constructor for class %s with parameters %s",multiplexerType.getName(),Arrays.deepToString(sourceFields)),ex);
|
||||
}
|
||||
|
||||
Multiplexer multiplexer = null;
|
||||
Multiplexer multiplexer;
|
||||
try {
|
||||
multiplexer = multiplexerConstructor.newInstance(sourceValues);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -287,8 +287,8 @@ public abstract class CommandLineProgram {
|
|||
* a function used to indicate an error occurred in the command line tool
|
||||
*/
|
||||
private static void printDocumentationReference() {
|
||||
errorPrintf("Visit our wiki for extensive documentation http://www.broadinstitute.org/gsa/wiki%n");
|
||||
errorPrintf("Visit our forum to view answers to commonly asked questions http://getsatisfaction.com/gsa%n");
|
||||
errorPrintf("Visit our website and forum for extensive documentation and answers to %n");
|
||||
errorPrintf("commonly asked questions http://www.broadinstitute.org/gatk%n");
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -369,9 +369,9 @@ public abstract class CommandLineProgram {
|
|||
System.exit(1);
|
||||
}
|
||||
|
||||
public static void exitSystemWithSamError(final Exception e) {
|
||||
if ( e.getMessage() == null )
|
||||
throw new ReviewedStingException("SamException found with no message!", e);
|
||||
public static void exitSystemWithSamError(final Throwable t) {
|
||||
if ( t.getMessage() == null )
|
||||
throw new ReviewedStingException("SamException found with no message!", t);
|
||||
|
||||
errorPrintf("------------------------------------------------------------------------------------------%n");
|
||||
errorPrintf("A BAM ERROR has occurred (version %s): %n", CommandLineGATK.getVersionNumber());
|
||||
|
|
@ -383,7 +383,7 @@ public abstract class CommandLineProgram {
|
|||
errorPrintf("Also, please ensure that your BAM index is not corrupted: delete the current one and regenerate it with 'samtools index'%n");
|
||||
printDocumentationReference();
|
||||
errorPrintf("%n");
|
||||
errorPrintf("MESSAGE: %s%n", e.getMessage().trim());
|
||||
errorPrintf("MESSAGE: %s%n", t.getMessage().trim());
|
||||
errorPrintf("------------------------------------------------------------------------------------------%n");
|
||||
System.exit(1);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,19 +25,18 @@
|
|||
package org.broadinstitute.sting.commandline;
|
||||
|
||||
import com.google.java.contract.Requires;
|
||||
import org.broad.tribble.AbstractFeatureReader;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.readers.AsciiLineReader;
|
||||
import org.broad.tribble.FeatureReader;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
|
||||
import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -71,39 +70,31 @@ public final class IntervalBinding<T extends Feature> {
|
|||
return stringIntervals;
|
||||
}
|
||||
|
||||
public List<GenomeLoc> getIntervals(GenomeAnalysisEngine toolkit) {
|
||||
public List<GenomeLoc> getIntervals(final GenomeAnalysisEngine toolkit) {
|
||||
return getIntervals(toolkit.getGenomeLocParser());
|
||||
}
|
||||
|
||||
public List<GenomeLoc> getIntervals(final GenomeLocParser genomeLocParser) {
|
||||
List<GenomeLoc> intervals;
|
||||
|
||||
if ( featureIntervals != null ) {
|
||||
intervals = new ArrayList<GenomeLoc>();
|
||||
|
||||
//RMDTrackBuilder builder = new RMDTrackBuilder(toolkit.getReferenceDataSource().getReference().getSequenceDictionary(),
|
||||
// toolkit.getGenomeLocParser(),
|
||||
// toolkit.getArguments().unsafe);
|
||||
|
||||
// TODO -- after ROD system cleanup, go through the ROD system so that we can handle things like gzipped files
|
||||
|
||||
final FeatureCodec codec = new FeatureManager().getByName(featureIntervals.getTribbleType()).getCodec();
|
||||
if ( codec instanceof ReferenceDependentFeatureCodec )
|
||||
((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(toolkit.getGenomeLocParser());
|
||||
((ReferenceDependentFeatureCodec)codec).setGenomeLocParser(genomeLocParser);
|
||||
try {
|
||||
final FileInputStream fis = new FileInputStream(new File(featureIntervals.getSource()));
|
||||
final AsciiLineReader lineReader = new AsciiLineReader(fis);
|
||||
codec.readHeader(lineReader);
|
||||
String line = lineReader.readLine();
|
||||
while ( line != null ) {
|
||||
final Feature feature = codec.decodeLoc(line);
|
||||
if ( feature == null )
|
||||
throw new UserException.MalformedFile(featureIntervals.getSource(), "Couldn't parse line '" + line + "'");
|
||||
intervals.add(toolkit.getGenomeLocParser().createGenomeLoc(feature));
|
||||
line = lineReader.readLine();
|
||||
}
|
||||
FeatureReader<Feature> reader = AbstractFeatureReader.getFeatureReader(featureIntervals.getSource(), codec, false);
|
||||
for ( Feature feature : reader.iterator() )
|
||||
intervals.add(genomeLocParser.createGenomeLoc(feature));
|
||||
} catch (Exception e) {
|
||||
throw new UserException.MalformedFile(featureIntervals.getSource(), "Problem reading the interval file", e);
|
||||
}
|
||||
|
||||
} else {
|
||||
intervals = IntervalUtils.parseIntervalArguments(toolkit.getGenomeLocParser(), stringIntervals);
|
||||
intervals = IntervalUtils.parseIntervalArguments(genomeLocParser, stringIntervals);
|
||||
}
|
||||
|
||||
return intervals;
|
||||
|
|
|
|||
|
|
@ -78,24 +78,7 @@ public abstract class ParsingMethod {
|
|||
|
||||
String argument = matcher.group(1).trim();
|
||||
|
||||
Tags tags = new Tags();
|
||||
if(matcher.group(2) != null) {
|
||||
for(String tag: Utils.split(matcher.group(2),",")) {
|
||||
// Check for presence of an '=' sign, indicating a key-value pair in the tag line.
|
||||
int equalDelimiterPos = tag.indexOf('=');
|
||||
if(equalDelimiterPos >= 0) {
|
||||
// Sanity check; ensure that there aren't multiple '=' in this key-value pair.
|
||||
if(tag.indexOf('=',equalDelimiterPos+1) >= 0)
|
||||
throw new ArgumentException(String.format("Tag %s passed to argument %s is malformed. Please ensure that " +
|
||||
"key-value tags are of the form <key>=<value>, and neither key " +
|
||||
"nor value contain the '=' character", tag, argument));
|
||||
tags.addKeyValueTag(tag.substring(0,equalDelimiterPos),tag.substring(equalDelimiterPos+1));
|
||||
}
|
||||
else
|
||||
tags.addPositionalTag(tag);
|
||||
|
||||
}
|
||||
}
|
||||
Tags tags = parseTags(argument, matcher.group(2));
|
||||
|
||||
// Find the most appropriate argument definition for the given argument.
|
||||
ArgumentDefinition argumentDefinition = definitions.findArgumentDefinition( argument, definitionMatcher );
|
||||
|
|
@ -105,6 +88,28 @@ public abstract class ParsingMethod {
|
|||
return new ArgumentMatch(argument,argumentDefinition,position,tags);
|
||||
}
|
||||
|
||||
public static Tags parseTags(String argument, String tagString) {
|
||||
Tags tags = new Tags();
|
||||
if (tagString != null) {
|
||||
for(String tag: Utils.split(tagString, ",")) {
|
||||
// Check for presence of an '=' sign, indicating a key-value pair in the tag line.
|
||||
int equalDelimiterPos = tag.indexOf('=');
|
||||
if(equalDelimiterPos >= 0) {
|
||||
// Sanity check; ensure that there aren't multiple '=' in this key-value pair.
|
||||
if(tag.indexOf('=',equalDelimiterPos+1) >= 0)
|
||||
throw new ArgumentException(String.format("Tag %s passed to argument %s is malformed. Please ensure that " +
|
||||
"key-value tags are of the form <key>=<value>, and neither key " +
|
||||
"nor value contain the '=' character", tag, argument));
|
||||
tags.addKeyValueTag(tag.substring(0,equalDelimiterPos),tag.substring(equalDelimiterPos+1));
|
||||
}
|
||||
else
|
||||
tags.addPositionalTag(tag);
|
||||
|
||||
}
|
||||
}
|
||||
return tags;
|
||||
}
|
||||
|
||||
/**
|
||||
* A command-line argument always starts with an alphabetical character or underscore followed by any word character.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -0,0 +1,38 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
import java.lang.annotation.*;
|
||||
|
||||
/**
|
||||
* Allows the walker to be tagged, they can have an unlimited number of categories.
|
||||
*/
|
||||
@Documented
|
||||
@Inherited
|
||||
@Retention(RetentionPolicy.RUNTIME)
|
||||
@Target(ElementType.TYPE)
|
||||
public @interface Categorize {
|
||||
public Category[] value() default {Category.UNCATEGORIZED};
|
||||
}
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk;
|
||||
|
||||
public enum Category {
|
||||
|
||||
BAM_PROCESSING("Walker", "BAM Processing and Analysis Tools"),
|
||||
|
||||
COMPANION("Walker", "Companion Utilities"),
|
||||
|
||||
CANCER_SPECIFIC("Walker", "Cancer-specific Variant Discovery Tools"),
|
||||
|
||||
QUALITY_CONTROL("Walker", "Quality Control and Simple Analysis Tools"),
|
||||
|
||||
VALIDATION("Walker", "Validation Utilities"),
|
||||
|
||||
VARIANT_DETECTION("ex", "Variant Detection"),
|
||||
|
||||
VARIANT_DISCOVERY("Walker", "Variant Discovery Tools"),
|
||||
|
||||
VARIANT_EVALUATION("Walker", "Variant Evaluation and Manipulation Tools"),
|
||||
|
||||
EXPERIMENTAL("Walker", "Miscellaneous Experimental (and Potentially Unstable) Tools"),
|
||||
|
||||
UNCATEGORIZED("all", "No category");
|
||||
|
||||
private final String description;
|
||||
|
||||
/**
|
||||
* Accepted strings are from the GATKDocWorkUnit.group
|
||||
*/
|
||||
private final String type;
|
||||
|
||||
private Category(String type, String description) {
|
||||
this.description = description;
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public String type() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
}
|
||||
|
|
@ -36,7 +36,9 @@ import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager;
|
|||
import org.broadinstitute.sting.gatk.walkers.Attribution;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.help.*;
|
||||
import org.broadinstitute.sting.utils.help.ApplicationDetails;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
import org.broadinstitute.sting.utils.help.GATKDocUtils;
|
||||
import org.broadinstitute.sting.utils.text.TextFormattingUtils;
|
||||
|
||||
import java.util.*;
|
||||
|
|
@ -101,20 +103,35 @@ public class CommandLineGATK extends CommandLineExecutable {
|
|||
// TODO: Should Picard exceptions be, in general, UserExceptions or ReviewedStingExceptions?
|
||||
exitSystemWithError(e);
|
||||
} catch (SAMException e) {
|
||||
checkForTooManyOpenFilesProblem(e.getMessage());
|
||||
checkForMaskedUserErrors(e);
|
||||
exitSystemWithSamError(e);
|
||||
} catch (OutOfMemoryError e) {
|
||||
exitSystemWithUserError(new UserException.NotEnoughMemory());
|
||||
} catch (Throwable t) {
|
||||
checkForTooManyOpenFilesProblem(t.getMessage());
|
||||
checkForMaskedUserErrors(t);
|
||||
exitSystemWithError(t);
|
||||
}
|
||||
}
|
||||
|
||||
private static void checkForTooManyOpenFilesProblem(String message) {
|
||||
// Special case the "Too many open files" error because it's a common User Error for which we know what to do
|
||||
if ( message != null && message.indexOf("Too many open files") != -1 )
|
||||
protected static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
|
||||
protected static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
|
||||
private static void checkForMaskedUserErrors(final Throwable t) {
|
||||
final String message = t.getMessage();
|
||||
if ( message == null )
|
||||
return;
|
||||
|
||||
// we know what to do about the common "Too many open files" error
|
||||
if ( message.indexOf("Too many open files") != -1 )
|
||||
exitSystemWithUserError(new UserException.TooManyOpenFiles());
|
||||
|
||||
// malformed BAM looks like a SAM file
|
||||
if ( message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_1) != -1 ||
|
||||
message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_2) != -1 )
|
||||
exitSystemWithSamError(t);
|
||||
|
||||
// can't close tribble index when writing
|
||||
if ( message.indexOf("Unable to close index for") != -1 )
|
||||
exitSystemWithUserError(new UserException(t.getCause().getMessage()));
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -126,8 +143,7 @@ public class CommandLineGATK extends CommandLineExecutable {
|
|||
List<String> header = new ArrayList<String>();
|
||||
header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime()));
|
||||
header.add("Copyright (c) 2010 The Broad Institute");
|
||||
header.add("Please view our documentation at http://www.broadinstitute.org/gsa/wiki");
|
||||
header.add("For support, please view our support site at http://getsatisfaction.com/gsa");
|
||||
header.add("For support and documentation go to http://www.broadinstitute.org/gatk");
|
||||
return header;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ import net.sf.samtools.SAMFileHeader;
|
|||
import net.sf.samtools.SAMRecord;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.readers.PositionalBufferedStream;
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
|
||||
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
|
||||
|
|
@ -51,13 +51,19 @@ import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
|
|||
import org.broadinstitute.sting.gatk.walkers.*;
|
||||
import org.broadinstitute.sting.utils.*;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.collections.Pair;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalUtils;
|
||||
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
|
||||
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
|
|
@ -192,7 +198,16 @@ public class GenomeAnalysisEngine {
|
|||
private BaseRecalibration baseRecalibration = null;
|
||||
public BaseRecalibration getBaseRecalibration() { return baseRecalibration; }
|
||||
public boolean hasBaseRecalibration() { return baseRecalibration != null; }
|
||||
public void setBaseRecalibration(File recalFile, int quantizationLevels) { baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels); }
|
||||
public void setBaseRecalibration(final File recalFile, final int quantizationLevels, final boolean disableIndelQuals, final int preserveQLessThan, final boolean emitOriginalQuals) {
|
||||
baseRecalibration = new BaseRecalibration(recalFile, quantizationLevels, disableIndelQuals, preserveQLessThan, emitOriginalQuals);
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method to determine whether this is the lite version of the GATK
|
||||
*/
|
||||
public boolean isGATKLite() {
|
||||
return GATKLiteUtils.isGATKLite();
|
||||
}
|
||||
|
||||
/**
|
||||
* Actually run the GATK with the specified walker.
|
||||
|
|
@ -204,8 +219,10 @@ public class GenomeAnalysisEngine {
|
|||
//monitor.start();
|
||||
setStartTime(new java.util.Date());
|
||||
|
||||
final GATKArgumentCollection args = this.getArguments();
|
||||
|
||||
// validate our parameters
|
||||
if (this.getArguments() == null) {
|
||||
if (args == null) {
|
||||
throw new ReviewedStingException("The GATKArgumentCollection passed to GenomeAnalysisEngine can not be null.");
|
||||
}
|
||||
|
||||
|
|
@ -213,12 +230,16 @@ public class GenomeAnalysisEngine {
|
|||
if (this.walker == null)
|
||||
throw new ReviewedStingException("The walker passed to GenomeAnalysisEngine can not be null.");
|
||||
|
||||
if (this.getArguments().nonDeterministicRandomSeed)
|
||||
if (args.nonDeterministicRandomSeed)
|
||||
resetRandomGenerator(System.currentTimeMillis());
|
||||
|
||||
// TODO -- REMOVE ME WHEN WE STOP BCF testing
|
||||
if ( args.USE_SLOW_GENOTYPES )
|
||||
GenotypeBuilder.MAKE_FAST_BY_DEFAULT = false;
|
||||
|
||||
// if the use specified an input BQSR recalibration table then enable on the fly recalibration
|
||||
if (this.getArguments().BQSR_RECAL_FILE != null)
|
||||
setBaseRecalibration(this.getArguments().BQSR_RECAL_FILE, this.getArguments().quantizationLevels);
|
||||
if (args.BQSR_RECAL_FILE != null)
|
||||
setBaseRecalibration(args.BQSR_RECAL_FILE, args.quantizationLevels, args.disableIndelQuals, args.PRESERVE_QSCORES_LESS_THAN, args.emitOriginalQuals);
|
||||
|
||||
// Determine how the threads should be divided between CPU vs. IO.
|
||||
determineThreadAllocation();
|
||||
|
|
@ -572,7 +593,6 @@ public class GenomeAnalysisEngine {
|
|||
* Setup the intervals to be processed
|
||||
*/
|
||||
protected void initializeIntervals() {
|
||||
|
||||
// return if no interval arguments at all
|
||||
if ( argCollection.intervals == null && argCollection.excludeIntervals == null )
|
||||
return;
|
||||
|
|
@ -580,17 +600,22 @@ public class GenomeAnalysisEngine {
|
|||
// Note that the use of '-L all' is no longer supported.
|
||||
|
||||
// if include argument isn't given, create new set of all possible intervals
|
||||
GenomeLocSortedSet includeSortedSet = (argCollection.intervals == null ?
|
||||
GenomeLocSortedSet.createSetFromSequenceDictionary(this.referenceDataSource.getReference().getSequenceDictionary()) :
|
||||
loadIntervals(argCollection.intervals, argCollection.intervalSetRule));
|
||||
|
||||
Pair<GenomeLocSortedSet, GenomeLocSortedSet> includeExcludePair = IntervalUtils.parseIntervalBindingsPair(
|
||||
this.referenceDataSource,
|
||||
argCollection.intervals,
|
||||
argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding,
|
||||
argCollection.excludeIntervals);
|
||||
|
||||
GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst();
|
||||
GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond();
|
||||
|
||||
// if no exclude arguments, can return parseIntervalArguments directly
|
||||
if ( argCollection.excludeIntervals == null )
|
||||
if ( excludeSortedSet == null )
|
||||
intervals = includeSortedSet;
|
||||
|
||||
// otherwise there are exclude arguments => must merge include and exclude GenomeLocSortedSets
|
||||
else {
|
||||
GenomeLocSortedSet excludeSortedSet = loadIntervals(argCollection.excludeIntervals, IntervalSetRule.UNION);
|
||||
intervals = includeSortedSet.subtractRegions(excludeSortedSet);
|
||||
|
||||
// logging messages only printed when exclude (-XL) arguments are given
|
||||
|
|
@ -603,28 +628,6 @@ public class GenomeAnalysisEngine {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads the intervals relevant to the current execution
|
||||
* @param argList argument bindings; might include filenames, intervals in samtools notation, or a combination of the above
|
||||
* @param rule interval merging rule
|
||||
* @return A sorted, merged list of all intervals specified in this arg list.
|
||||
*/
|
||||
protected GenomeLocSortedSet loadIntervals( List<IntervalBinding<Feature>> argList, IntervalSetRule rule ) {
|
||||
|
||||
List<GenomeLoc> allIntervals = new ArrayList<GenomeLoc>();
|
||||
for ( IntervalBinding intervalBinding : argList ) {
|
||||
List<GenomeLoc> intervals = intervalBinding.getIntervals(this);
|
||||
|
||||
if ( intervals.isEmpty() ) {
|
||||
logger.warn("The interval file " + intervalBinding.getSource() + " contains no intervals that could be parsed.");
|
||||
}
|
||||
|
||||
allIntervals = IntervalUtils.mergeListsBySetOperator(intervals, allIntervals, rule);
|
||||
}
|
||||
|
||||
return IntervalUtils.sortAndMergeIntervals(genomeLocParser, allIntervals, argCollection.intervalMerging);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add additional, externally managed IO streams for inputs.
|
||||
*
|
||||
|
|
@ -795,7 +798,18 @@ public class GenomeAnalysisEngine {
|
|||
SAMSequenceDictionary sequenceDictionary,
|
||||
GenomeLocParser genomeLocParser,
|
||||
ValidationExclusion.TYPE validationExclusionType) {
|
||||
RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser,validationExclusionType);
|
||||
VCFHeader header = null;
|
||||
if ( getArguments().repairVCFHeader != null ) {
|
||||
try {
|
||||
final PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(getArguments().repairVCFHeader));
|
||||
header = (VCFHeader)new VCFCodec().readHeader(pbs).getHeaderValue();
|
||||
pbs.close();
|
||||
} catch ( IOException e ) {
|
||||
throw new UserException.CouldNotReadInputFile(getArguments().repairVCFHeader, e);
|
||||
}
|
||||
}
|
||||
|
||||
RMDTrackBuilder builder = new RMDTrackBuilder(sequenceDictionary,genomeLocParser, header, validationExclusionType);
|
||||
|
||||
List<ReferenceOrderedDataSource> dataSources = new ArrayList<ReferenceOrderedDataSource>();
|
||||
for (RMDTriplet fileDescriptor : referenceMetaDataFiles)
|
||||
|
|
@ -819,6 +833,15 @@ public class GenomeAnalysisEngine {
|
|||
return readsDataSource.getHeader();
|
||||
}
|
||||
|
||||
public boolean lenientVCFProcessing() {
|
||||
return lenientVCFProcessing(argCollection.unsafe);
|
||||
}
|
||||
|
||||
public static boolean lenientVCFProcessing(final ValidationExclusion.TYPE val) {
|
||||
return val == ValidationExclusion.TYPE.ALL
|
||||
|| val == ValidationExclusion.TYPE.LENIENT_VCF_PROCESSING;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the unmerged SAM file header for an individual reader.
|
||||
* @param reader The reader.
|
||||
|
|
|
|||
|
|
@ -51,11 +51,6 @@ public class ReadProperties {
|
|||
return includeReadsWithDeletionAtLoci;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public boolean generateExtendedEvents() {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets a list of the files acting as sources of reads.
|
||||
* @return A list of files storing reads data.
|
||||
|
|
|
|||
|
|
@ -49,7 +49,7 @@ public class WalkerManager extends PluginManager<Walker> {
|
|||
private ResourceBundle helpText;
|
||||
|
||||
public WalkerManager() {
|
||||
super(Walker.class,"walker","Walker");
|
||||
super(Walker.class,"walker","");
|
||||
helpText = TextFormattingUtils.loadResourceBundle("StingText");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ import org.broadinstitute.sting.gatk.DownsampleType;
|
|||
import org.broadinstitute.sting.gatk.DownsamplingMethod;
|
||||
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
|
||||
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
|
||||
import org.broadinstitute.sting.utils.QualityUtils;
|
||||
import org.broadinstitute.sting.utils.baq.BAQ;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
|
||||
import org.broadinstitute.sting.utils.interval.IntervalSetRule;
|
||||
|
|
@ -104,6 +105,12 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName = "interval_merging", shortName = "im", doc = "Indicates the interval merging rule we should use for abutting intervals", required = false)
|
||||
public IntervalMergingRule intervalMerging = IntervalMergingRule.ALL;
|
||||
|
||||
/**
|
||||
* For example, '-L chr1:100' with a padding value of 20 would turn into '-L chr1:80-120'.
|
||||
*/
|
||||
@Argument(fullName = "interval_padding", shortName = "ip", doc = "Indicates how many basepairs of padding to include around each of the intervals specified with the -L/--intervals argument", required = false)
|
||||
public int intervalPadding = 0;
|
||||
|
||||
@Input(fullName = "reference_sequence", shortName = "R", doc = "Reference sequence file", required = false)
|
||||
public File referenceFile = null;
|
||||
|
||||
|
|
@ -184,28 +191,61 @@ public class GATKArgumentCollection {
|
|||
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
|
||||
public Boolean useOriginalBaseQualities = false;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// BQSR arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* After the header, data records occur one per line until the end of the file. The first several items on a line are the
|
||||
* values of the individual covariates and will change depending on which covariates were specified at runtime. The last
|
||||
* three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches,
|
||||
* and the raw empirical quality score calculated by phred-scaling the mismatch rate.
|
||||
* Enables on-the-fly recalibrate of base qualities. The covariates tables are produced by the BaseQualityScoreRecalibrator tool.
|
||||
* Please be aware that one should only run recalibration with the covariates file created on the same input bam(s).
|
||||
*/
|
||||
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="Filename for the input covariates table recalibration .csv file which enables on the fly base quality score recalibration")
|
||||
public File BQSR_RECAL_FILE = null; // BUGBUG: need a better argument name once we decide how BQSRs v1 and v2 will live in the code base simultaneously
|
||||
@Input(fullName="BQSR", shortName="BQSR", required=false, doc="The input covariates table file which enables on-the-fly base quality score recalibration")
|
||||
public File BQSR_RECAL_FILE = null;
|
||||
|
||||
/**
|
||||
* Turns on the base quantization module. It requires a recalibration report (-BQSR).
|
||||
*
|
||||
* A value of 0 here means "do not quantize".
|
||||
* Any value greater than zero will be used to recalculate the quantization using this many levels.
|
||||
* Negative values do nothing (i.e. quantize using the recalibration report's quantization level -- same as not providing this parameter at all)
|
||||
* Any value greater than zero will be used to recalculate the quantization using that many levels.
|
||||
* Negative values mean that we should quantize using the recalibration report's quantization level.
|
||||
*/
|
||||
@Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels.", required=false)
|
||||
public int quantizationLevels = -1;
|
||||
@Hidden
|
||||
@Argument(fullName="quantize_quals", shortName = "qq", doc = "Quantize quality scores to a given number of levels (with -BQSR)", required=false)
|
||||
public int quantizationLevels = 0;
|
||||
|
||||
/**
|
||||
* Turns off printing of the base insertion and base deletion tags when using the -BQSR argument and only the base substitution qualities will be produced.
|
||||
*/
|
||||
@Argument(fullName="disable_indel_quals", shortName = "DIQ", doc = "If true, disables printing of base insertion and base deletion tags (with -BQSR)", required=false)
|
||||
public boolean disableIndelQuals = false;
|
||||
|
||||
/**
|
||||
* By default, the OQ tag in not emitted when using the -BQSR argument.
|
||||
*/
|
||||
@Argument(fullName="emit_original_quals", shortName = "EOQ", doc = "If true, enables printing of the OQ tag with the original base qualities (with -BQSR)", required=false)
|
||||
public boolean emitOriginalQuals = false;
|
||||
|
||||
/**
|
||||
* Do not modify quality scores less than this value but rather just write them out unmodified in the recalibrated BAM file.
|
||||
* In general it's unsafe to change qualities scores below < 6, since base callers use these values to indicate random or bad bases.
|
||||
* For example, Illumina writes Q2 bases when the machine has really gone wrong. This would be fine in and of itself,
|
||||
* but when you select a subset of these reads based on their ability to align to the reference and their dinucleotide effect,
|
||||
* your Q2 bin can be elevated to Q8 or Q10, leading to issues downstream.
|
||||
*/
|
||||
@Argument(fullName = "preserve_qscores_less_than", shortName = "preserveQ", doc = "Bases with quality scores less than this threshold won't be recalibrated (with -BQSR)", required = false)
|
||||
public int PRESERVE_QSCORES_LESS_THAN = QualityUtils.MIN_USABLE_Q_SCORE;
|
||||
|
||||
@Argument(fullName="defaultBaseQualities", shortName = "DBQ", doc = "If reads are missing some or all base quality scores, this value will be used for all base quality scores", required=false)
|
||||
public byte defaultBaseQualities = -1;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// Other utility arguments
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Argument(fullName = "validation_strictness", shortName = "S", doc = "How strict should we be with validation", required = false)
|
||||
public SAMFileReader.ValidationStringency strictnessLevel = SAMFileReader.ValidationStringency.SILENT;
|
||||
|
||||
|
|
@ -327,112 +367,27 @@ public class GATKArgumentCollection {
|
|||
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
//
|
||||
// methods
|
||||
// testing BCF2
|
||||
//
|
||||
// --------------------------------------------------------------------------------------------------------------
|
||||
|
||||
@Argument(fullName="generateShadowBCF",shortName = "generateShadowBCF",doc="If provided, whenever we create a VCFWriter we will also write out a BCF file alongside it, for testing purposes",required=false)
|
||||
@Hidden
|
||||
public boolean generateShadowBCF = false;
|
||||
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
|
||||
|
||||
@Argument(fullName="useSlowGenotypes",shortName = "useSlowGenotypes",doc="",required=false)
|
||||
@Hidden
|
||||
public boolean USE_SLOW_GENOTYPES = false;
|
||||
// TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed
|
||||
|
||||
/**
|
||||
* test equality between two arg collections. This function defines the statement:
|
||||
* "not fun to write"
|
||||
*
|
||||
* @param other the other collection
|
||||
*
|
||||
* @return true if they're equal
|
||||
* The file pointed to by this argument must be a VCF file. The GATK will read in just the header of this file
|
||||
* and then use the INFO, FORMAT, and FILTER field values from this file to repair the header file of any other
|
||||
* VCF file that GATK reads in. This allows us to have in effect a master set of header records and use these
|
||||
* to fill in any missing ones in input VCF files.
|
||||
*/
|
||||
public boolean equals(GATKArgumentCollection other) {
|
||||
if (other == null) return false;
|
||||
if (other.samFiles.size() != samFiles.size()) {
|
||||
return false;
|
||||
}
|
||||
for (int x = 0; x < samFiles.size(); x++) {
|
||||
if (!samFiles.get(x).equals(other.samFiles.get(x))) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (other.walkerArgs.size() != walkerArgs.size()) {
|
||||
return false;
|
||||
}
|
||||
for (String s : walkerArgs.keySet()) {
|
||||
if (!other.walkerArgs.containsKey(s)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (!other.samFiles.equals(this.samFiles)) {
|
||||
return false;
|
||||
}
|
||||
if(other.readBufferSize == null || this.readBufferSize == null) {
|
||||
// If either is null, return false if they're both null, otherwise keep going...
|
||||
if(other.readBufferSize != null || this.readBufferSize != null)
|
||||
return false;
|
||||
}
|
||||
else {
|
||||
if(!other.readBufferSize.equals(this.readBufferSize))
|
||||
return false;
|
||||
}
|
||||
if (!(other.readBufferSize == null && this.readBufferSize == null) && (other.readBufferSize == null || this.readBufferSize == null)) {
|
||||
return false;
|
||||
}
|
||||
if (!other.strictnessLevel.equals(this.strictnessLevel)) {
|
||||
return false;
|
||||
}
|
||||
if (!other.referenceFile.equals(this.referenceFile)) {
|
||||
return false;
|
||||
}
|
||||
if ((other.intervals == null && this.intervals != null) || !other.intervals.equals(this.intervals)) {
|
||||
return false;
|
||||
}
|
||||
if (!other.excludeIntervals.equals(this.excludeIntervals)) {
|
||||
return false;
|
||||
}
|
||||
if (!other.unsafe.equals(this.unsafe)) {
|
||||
return false;
|
||||
}
|
||||
if ((other.downsampleFraction == null && this.downsampleFraction != null) ||
|
||||
(other.downsampleFraction != null && !other.downsampleFraction.equals(this.downsampleFraction))) {
|
||||
return false;
|
||||
}
|
||||
if ((other.downsampleCoverage == null && this.downsampleCoverage != null) ||
|
||||
(other.downsampleCoverage != null && !other.downsampleCoverage.equals(this.downsampleCoverage))) {
|
||||
return false;
|
||||
}
|
||||
if (!other.numberOfThreads.equals(this.numberOfThreads)) {
|
||||
return false;
|
||||
}
|
||||
if ((this.numberOfCPUThreads == null && other.numberOfCPUThreads != null) ||
|
||||
this.numberOfCPUThreads.equals(other.numberOfCPUThreads) ) {
|
||||
return false;
|
||||
}
|
||||
if ((this.numberOfIOThreads == null && other.numberOfIOThreads != null) ||
|
||||
this.numberOfIOThreads.equals(other.numberOfIOThreads) ) {
|
||||
return false;
|
||||
}
|
||||
if ((other.numberOfBAMFileHandles == null && this.numberOfBAMFileHandles != null) ||
|
||||
(other.numberOfBAMFileHandles != null && !other.numberOfBAMFileHandles.equals(this.numberOfBAMFileHandles))) {
|
||||
return false;
|
||||
}
|
||||
if (other.intervalMerging != this.intervalMerging) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (other.phoneHomeType != this.phoneHomeType) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (intervalSetRule != other.intervalSetRule)
|
||||
return false;
|
||||
|
||||
if ( BAQMode != other.BAQMode ) return false;
|
||||
if ( BAQGOP != other.BAQGOP ) return false;
|
||||
|
||||
if ((other.performanceLog == null && this.performanceLog != null) ||
|
||||
(other.performanceLog != null && !other.performanceLog.equals(this.performanceLog)))
|
||||
return false;
|
||||
|
||||
if (allowIntervalsWithUnindexedBAM != other.allowIntervalsWithUnindexedBAM)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
@Argument(fullName="repairVCFHeader", shortName = "repairVCFHeader", doc="If provided, whenever we read a VCF file we will use the header in this file to repair the header of the input VCF files", required=false)
|
||||
public File repairVCFHeader = null;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -40,6 +40,7 @@ public class ValidationExclusion {
|
|||
ALLOW_UNSET_BAM_SORT_ORDER, // assume that the bam is sorted, even if the SO (sort-order) flag is not set
|
||||
NO_READ_ORDER_VERIFICATION, // do not validate that the reads are in order as we take them from the bam file
|
||||
ALLOW_SEQ_DICT_INCOMPATIBILITY, // allow dangerous, but not fatal, sequence dictionary incompabilities
|
||||
LENIENT_VCF_PROCESSING, // allow non-standard values for standard VCF header lines. Don't worry about size differences between header and values, etc.
|
||||
@EnumerationArgumentDefault // set the ALL value to the default value, so if they specify just -U, we get the ALL
|
||||
ALL // do not check for all of the above conditions, DEFAULT
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,7 +28,6 @@ package org.broadinstitute.sting.gatk.contexts;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.HasGenomeLocation;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileup;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
|
||||
|
|
@ -89,36 +88,9 @@ public class AlignmentContext implements HasGenomeLocation {
|
|||
* @return
|
||||
*/
|
||||
public ReadBackedPileup getBasePileup() {
|
||||
if(!hasBasePileup())
|
||||
throw new ReviewedStingException("No base pileup is available. Please check for a base pileup with hasBasePileup() before attempting to retrieve a pileup.");
|
||||
return basePileup;
|
||||
}
|
||||
|
||||
/** Returns extended event (indel) pileup over the current genomic location. May return null if this context keeps
|
||||
* only base pileup.
|
||||
* @return
|
||||
*/
|
||||
@Deprecated
|
||||
public ReadBackedExtendedEventPileup getExtendedEventPileup() {
|
||||
if(!hasExtendedEventPileup())
|
||||
throw new ReviewedStingException("No extended event pileup is present.");
|
||||
return (ReadBackedExtendedEventPileup)basePileup;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if this alignment context keeps base pileup over the current genomic location.
|
||||
* TODO: Syntax of AlignmentContext uses hasBasePileup() / hasExtendedEventPileup() as an enumeration mechanism. Change this to a more sensible interface.
|
||||
* @return
|
||||
*/
|
||||
public boolean hasBasePileup() { return !(basePileup instanceof ReadBackedExtendedEventPileup); }
|
||||
|
||||
/** Returns true if this alignment context keeps extended event (indel) pileup over the current genomic location.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
@Deprecated
|
||||
public boolean hasExtendedEventPileup() { return basePileup instanceof ReadBackedExtendedEventPileup; }
|
||||
|
||||
/**
|
||||
* Returns true if any reads have been filtered out of the pileup due to excess DoC.
|
||||
* @return True if reads have been filtered out. False otherwise.
|
||||
|
|
|
|||
|
|
@ -116,19 +116,15 @@ public class AlignmentContextUtils {
|
|||
*
|
||||
**/
|
||||
public static Map<SAMReadGroupRecord, AlignmentContext> splitContextByReadGroup(AlignmentContext context, Collection<SAMReadGroupRecord> readGroups) {
|
||||
if ( ! context.hasBasePileup() ) {
|
||||
return Collections.emptyMap();
|
||||
} else {
|
||||
HashMap<SAMReadGroupRecord, AlignmentContext> contexts = new HashMap<SAMReadGroupRecord, AlignmentContext>();
|
||||
HashMap<SAMReadGroupRecord, AlignmentContext> contexts = new HashMap<SAMReadGroupRecord, AlignmentContext>();
|
||||
|
||||
for (SAMReadGroupRecord rg : readGroups) {
|
||||
ReadBackedPileup rgPileup = context.getBasePileup().getPileupForReadGroup(rg.getReadGroupId());
|
||||
if ( rgPileup != null ) // there we some reads for RG
|
||||
contexts.put(rg, new AlignmentContext(context.getLocation(), rgPileup));
|
||||
}
|
||||
|
||||
return contexts;
|
||||
for (SAMReadGroupRecord rg : readGroups) {
|
||||
ReadBackedPileup rgPileup = context.getBasePileup().getPileupForReadGroup(rg.getReadGroupId());
|
||||
if ( rgPileup != null ) // there we some reads for RG
|
||||
contexts.put(rg, new AlignmentContext(context.getLocation(), rgPileup));
|
||||
}
|
||||
|
||||
return contexts;
|
||||
}
|
||||
|
||||
public static Map<String, AlignmentContext> splitContextBySampleName(ReadBackedPileup pileup) {
|
||||
|
|
@ -139,32 +135,16 @@ public class AlignmentContextUtils {
|
|||
public static AlignmentContext joinContexts(Collection<AlignmentContext> contexts) {
|
||||
// validation
|
||||
GenomeLoc loc = contexts.iterator().next().getLocation();
|
||||
boolean isExtended = contexts.iterator().next().basePileup instanceof ReadBackedExtendedEventPileup;
|
||||
for(AlignmentContext context: contexts) {
|
||||
if(!loc.equals(context.getLocation()))
|
||||
throw new ReviewedStingException("Illegal attempt to join contexts from different genomic locations");
|
||||
if(isExtended != (context.basePileup instanceof ReadBackedExtendedEventPileup))
|
||||
throw new ReviewedStingException("Illegal attempt to join simple and extended contexts");
|
||||
}
|
||||
|
||||
AlignmentContext jointContext;
|
||||
if(isExtended) {
|
||||
List<ExtendedEventPileupElement> pe = new ArrayList<ExtendedEventPileupElement>();
|
||||
for(AlignmentContext context: contexts) {
|
||||
for(PileupElement pileupElement: context.basePileup)
|
||||
pe.add((ExtendedEventPileupElement)pileupElement);
|
||||
}
|
||||
jointContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc,pe));
|
||||
List<PileupElement> pe = new ArrayList<PileupElement>();
|
||||
for(AlignmentContext context: contexts) {
|
||||
for(PileupElement pileupElement: context.basePileup)
|
||||
pe.add(pileupElement);
|
||||
}
|
||||
else {
|
||||
List<PileupElement> pe = new ArrayList<PileupElement>();
|
||||
for(AlignmentContext context: contexts) {
|
||||
for(PileupElement pileupElement: context.basePileup)
|
||||
pe.add(pileupElement);
|
||||
}
|
||||
jointContext = new AlignmentContext(loc, new ReadBackedPileupImpl(loc,pe));
|
||||
}
|
||||
|
||||
return jointContext;
|
||||
return new AlignmentContext(loc, new ReadBackedPileupImpl(loc,pe));
|
||||
}
|
||||
}
|
||||
|
|
@ -27,13 +27,12 @@ package org.broadinstitute.sting.gatk.datasources.reads;
|
|||
import net.sf.picard.util.PeekableIterator;
|
||||
import net.sf.samtools.GATKBAMFileSpan;
|
||||
import net.sf.samtools.GATKChunk;
|
||||
import net.sf.samtools.SAMFileHeader;
|
||||
import net.sf.samtools.SAMFileSpan;
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.GenomeLocSortedSet;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
||||
|
|
@ -265,7 +264,10 @@ public class BAMScheduler implements Iterator<FilePointer> {
|
|||
// Naive algorithm: find all elements in current contig for proper schedule creation.
|
||||
List<GenomeLoc> lociInContig = new LinkedList<GenomeLoc>();
|
||||
for(GenomeLoc locus: loci) {
|
||||
if(!GenomeLoc.isUnmapped(locus) && dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded)
|
||||
if (!GenomeLoc.isUnmapped(locus) && dataSource.getHeader().getSequence(locus.getContig()) == null)
|
||||
throw new ReviewedStingException("BAM file(s) do not have the contig: " + locus.getContig() + ". You are probably using a different reference than the one this file was aligned with");
|
||||
|
||||
if (!GenomeLoc.isUnmapped(locus) && dataSource.getHeader().getSequence(locus.getContig()).getSequenceIndex() == lastReferenceSequenceLoaded)
|
||||
lociInContig.add(locus);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.reads;
|
||||
|
||||
import net.sf.samtools.*;
|
||||
import org.broadinstitute.sting.gatk.CommandLineGATK;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
|
||||
|
|
@ -300,7 +301,7 @@ public class GATKBAMIndex {
|
|||
fileChannel = fileStream.getChannel();
|
||||
}
|
||||
catch (IOException exc) {
|
||||
throw new ReviewedStingException("Unable to open index file " + mFile, exc);
|
||||
throw new ReviewedStingException("Unable to open index file (" + exc.getMessage() +")" + mFile, exc);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -664,12 +664,12 @@ public class SAMDataSource {
|
|||
IndexedFastaSequenceFile refReader,
|
||||
BaseRecalibration bqsrApplier,
|
||||
byte defaultBaseQualities) {
|
||||
if (useOriginalBaseQualities || defaultBaseQualities >= 0)
|
||||
// only wrap if we are replacing the original qualities or using a default base quality
|
||||
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
|
||||
|
||||
// NOTE: this (and other filtering) should be done before on-the-fly sorting
|
||||
// as there is no reason to sort something that we will end of throwing away
|
||||
// *********************************************************************************** //
|
||||
// * NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
|
||||
// * (otherwise we will process something that we may end up throwing away) * //
|
||||
// *********************************************************************************** //
|
||||
|
||||
if (downsamplingFraction != null)
|
||||
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
|
||||
|
||||
|
|
@ -678,14 +678,18 @@ public class SAMDataSource {
|
|||
if (!noValidationOfReadOrder && enableVerification)
|
||||
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator);
|
||||
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
|
||||
|
||||
if (useOriginalBaseQualities || defaultBaseQualities >= 0)
|
||||
// only wrap if we are replacing the original qualities or using a default base quality
|
||||
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
|
||||
|
||||
if (bqsrApplier != null)
|
||||
wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier);
|
||||
|
||||
if (cmode != BAQ.CalculationMode.OFF)
|
||||
wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode);
|
||||
|
||||
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
|
||||
|
||||
return wrappedIterator;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@
|
|||
package org.broadinstitute.sting.gatk.datasources.rmd;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
|
|
@ -33,7 +34,7 @@ import java.util.*;
|
|||
/**
|
||||
* A pool of open resources, all of which can create a closeable iterator.
|
||||
*/
|
||||
abstract class ResourcePool <T,I extends Iterator> {
|
||||
abstract class ResourcePool <T,I extends CloseableIterator> {
|
||||
/**
|
||||
* Sequence dictionary.
|
||||
*/
|
||||
|
|
@ -109,6 +110,9 @@ abstract class ResourcePool <T,I extends Iterator> {
|
|||
T resource = resourceAssignments.get( iterator );
|
||||
Object obj = resourceAssignments.remove(iterator);
|
||||
|
||||
// Close the iterator.
|
||||
iterator.close();
|
||||
|
||||
// make sure we actually removed the assignment
|
||||
if (obj == null)
|
||||
throw new ReviewedStingException("Failed to remove resource assignment; target key had no associated value in the resource assignment map");
|
||||
|
|
|
|||
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The basic downsampler API, with no reads-specific operations
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public interface Downsampler<T> {
|
||||
|
||||
/*
|
||||
* Submit one item to the downsampler for consideration . Some downsamplers will be able to determine
|
||||
* immediately whether the item survives the downsampling process, while others will need to see
|
||||
* more items before making that determination.
|
||||
*/
|
||||
public void submit( T item );
|
||||
|
||||
/*
|
||||
* Submit a collection of items to the downsampler for consideration.
|
||||
*/
|
||||
public void submit( Collection<T> items );
|
||||
|
||||
/*
|
||||
* Are there items that have survived the downsampling process waiting to be retrieved?
|
||||
*/
|
||||
public boolean hasDownsampledItems();
|
||||
|
||||
/*
|
||||
* Return (and remove) all items that have survived downsampling and are waiting to be retrieved.
|
||||
*/
|
||||
public List<T> consumeDownsampledItems();
|
||||
|
||||
/*
|
||||
* Are there items stored in this downsampler that it doesn't yet know whether they will
|
||||
* ultimately survive the downsampling process?
|
||||
*/
|
||||
public boolean hasPendingItems();
|
||||
|
||||
/*
|
||||
* Used to tell the downsampler that no more items will be submitted to it, and that it should
|
||||
* finalize any pending items.
|
||||
*/
|
||||
public void signalEndOfInput();
|
||||
|
||||
/*
|
||||
* Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state
|
||||
* information.
|
||||
*/
|
||||
public void clear();
|
||||
}
|
||||
|
|
@ -0,0 +1,98 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
|
||||
/**
|
||||
* StingSAMIterator wrapper around our generic reads downsampler interface
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class DownsamplingReadsIterator implements StingSAMIterator {
|
||||
|
||||
private StingSAMIterator nestedSAMIterator;
|
||||
private ReadsDownsampler<SAMRecord> downsampler;
|
||||
private Collection<SAMRecord> downsampledReadsCache;
|
||||
private Iterator<SAMRecord> downsampledReadsCacheIterator;
|
||||
|
||||
public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
|
||||
nestedSAMIterator = iter;
|
||||
this.downsampler = downsampler;
|
||||
fillDownsampledReadsCache();
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
if ( downsampledReadsCacheIterator.hasNext() ) {
|
||||
return true;
|
||||
}
|
||||
else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
public SAMRecord next() {
|
||||
if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) {
|
||||
throw new NoSuchElementException("next() called when there are no more items");
|
||||
}
|
||||
|
||||
return downsampledReadsCacheIterator.next();
|
||||
}
|
||||
|
||||
private boolean fillDownsampledReadsCache() {
|
||||
while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) {
|
||||
downsampler.submit(nestedSAMIterator.next());
|
||||
}
|
||||
|
||||
if ( ! nestedSAMIterator.hasNext() ) {
|
||||
downsampler.signalEndOfInput();
|
||||
}
|
||||
|
||||
downsampledReadsCache = downsampler.consumeDownsampledItems();
|
||||
downsampledReadsCacheIterator = downsampledReadsCache.iterator();
|
||||
|
||||
return downsampledReadsCacheIterator.hasNext();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Can not remove records from a SAM file via an iterator!");
|
||||
}
|
||||
|
||||
public void close() {
|
||||
nestedSAMIterator.close();
|
||||
}
|
||||
|
||||
public Iterator<SAMRecord> iterator() {
|
||||
return this;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Fractional Downsampler: selects a specified fraction of the reads for inclusion
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
|
||||
private ArrayList<T> selectedReads;
|
||||
|
||||
private int cutoffForInclusion;
|
||||
|
||||
private static final int RANDOM_POOL_SIZE = 10000;
|
||||
|
||||
public FractionalDownsampler( double fraction ) {
|
||||
if ( fraction < 0.0 || fraction > 1.0 ) {
|
||||
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
|
||||
}
|
||||
|
||||
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
|
||||
clear();
|
||||
}
|
||||
|
||||
public void submit( T newRead ) {
|
||||
if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) {
|
||||
selectedReads.add(newRead);
|
||||
}
|
||||
}
|
||||
|
||||
public void submit( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasDownsampledItems() {
|
||||
return selectedReads.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeDownsampledItems() {
|
||||
List<T> downsampledItems = selectedReads;
|
||||
clear();
|
||||
return downsampledItems;
|
||||
}
|
||||
|
||||
public boolean hasPendingItems() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
selectedReads = new ArrayList<T>();
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,259 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/**
 * Positional Downsampler: When eliminating reads, try to do so evenly based on the alignment start positions.
 *
 * Reads sharing an alignment start are first run through a per-position ReservoirDownsampler;
 * the surviving reads are then grouped by start position and further thinned so that total
 * coverage across overlapping groups does not exceed targetCoverage. Requires coordinate-sorted
 * input (see requiresCoordinateSortOrder()).
 *
 * @author David Roazen
 */
public class PositionalDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {

    // Desired maximum coverage across overlapping read groupings
    private int targetCoverage;

    // Per-start-position downsampler; caps the number of reads sharing one alignment start
    private ReservoirDownsampler<T> reservoir;

    // Contig index of the reads currently accumulating in the reservoir
    private int currentContigIndex;

    // Alignment start of the reads currently accumulating in the reservoir
    private int currentAlignmentStart;

    // Groupings of already-reservoir-downsampled reads, ordered by position; a group may
    // still lose reads until all of its reads have been finalized
    private LinkedList<PositionalReadGrouping> pendingReads;

    // Reads whose fate is decided; served to callers via consumeDownsampledItems()
    private ArrayList<T> finalizedReads;

    /**
     * @param targetCoverage maximum coverage to aim for across overlapping positions
     */
    public PositionalDownsampler ( int targetCoverage ) {
        this.targetCoverage = targetCoverage;
        clear();
    }

    /**
     * Submit one read. Reads must arrive in coordinate order; when a read starts past the
     * current position, the accumulated reservoir is flushed into the pending groupings first.
     */
    public void submit ( T newRead ) {
        if ( readIsPastCurrentPosition(newRead) ) {
            updateAndDownsamplePendingReads();
        }

        reservoir.submit(newRead);
        updateCurrentPosition(newRead);
    }

    public void submit ( Collection<T> newReads ) {
        for ( T read : newReads ) {
            submit(read);
        }
    }

    public boolean hasDownsampledItems() {
        return finalizedReads.size() > 0;
    }

    /**
     * @return all finalized reads accumulated so far; internal buffer is replaced, not cleared in place
     */
    public List<T> consumeDownsampledItems() {
        List<T> toReturn = finalizedReads;
        finalizedReads = new ArrayList<T>();
        return toReturn;
    }

    public boolean hasPendingItems() {
        return pendingReads.size() > 0;
    }

    /**
     * Flush the reservoir and force-finalize every remaining pending grouping.
     * Must be called once no more reads will be submitted, or trailing reads are lost.
     */
    public void signalEndOfInput() {
        updateAndDownsamplePendingReads();

        for ( PositionalReadGrouping group : pendingReads ) {
            group.finalizeAllActiveReads();
            finalizedReads.addAll(group.getFinalizedReads());
        }

        pendingReads.clear();
    }

    public void clear() {
        reservoir = new ReservoirDownsampler<T>(targetCoverage);
        pendingReads = new LinkedList<PositionalReadGrouping>();
        finalizedReads = new ArrayList<T>();
    }

    public boolean requiresCoordinateSortOrder() {
        return true;
    }

    // Record the position of the most recently submitted read
    private void updateCurrentPosition ( T read ) {
        currentContigIndex = read.getReferenceIndex();
        currentAlignmentStart = read.getAlignmentStart();
    }

    // True when the read starts on a different contig or strictly after the current start
    private boolean readIsPastCurrentPosition ( T read ) {
        return read.getReferenceIndex() != currentContigIndex || read.getAlignmentStart() > currentAlignmentStart;
    }

    // Close out groupings the current position has moved beyond, turn the reservoir's
    // contents into a new grouping at the current position, then rebalance coverage
    private void updateAndDownsamplePendingReads() {
        finalizeOutOfScopeReads();

        List<T> oldLocusReads = reservoir.consumeDownsampledItems();
        pendingReads.add(new PositionalReadGrouping(oldLocusReads, currentContigIndex, currentAlignmentStart));

        downsampleOverlappingGroups();
    }

    // Finalize reads that end before the current position; a fully-finalized grouping is
    // moved to finalizedReads only once every grouping before it has also been moved,
    // preserving the original coordinate ordering of the output
    private void finalizeOutOfScopeReads() {
        Iterator<PositionalReadGrouping> iter = pendingReads.iterator();
        boolean noPrecedingUnfinalizedGroups = true;

        while ( iter.hasNext() ) {
            PositionalReadGrouping currentGroup = iter.next();
            currentGroup.finalizeActiveReadsBeforePosition(currentContigIndex, currentAlignmentStart);

            if ( currentGroup.isFinalized() && noPrecedingUnfinalizedGroups ) {
                iter.remove();
                finalizedReads.addAll(currentGroup.getFinalizedReads());
            }
            else {
                noPrecedingUnfinalizedGroups = false;
            }
        }
    }

    // If the total active reads across all pending groupings exceed targetCoverage, trim
    // reads round-robin across groupings, never reducing an active grouping below one read
    private void downsampleOverlappingGroups() {
        int[] groupReadCounts = new int[pendingReads.size()];
        int totalCoverage = 0;
        int numActiveGroups = 0;
        int currentGroup = 0;

        // Tally per-group active read counts; index order matches pendingReads iteration order
        for ( PositionalReadGrouping group : pendingReads ) {
            groupReadCounts[currentGroup] = group.numActiveReads();
            totalCoverage += groupReadCounts[currentGroup];

            if ( groupReadCounts[currentGroup] > 0 ) {
                numActiveGroups++;
            }

            currentGroup++;
        }

        if ( totalCoverage <= targetCoverage ) {
            return;
        }

        // Cap removals so each active group keeps at least one read; this also guarantees
        // the round-robin loop below terminates
        int numReadsToRemove = Math.min(totalCoverage - targetCoverage, totalCoverage - numActiveGroups);
        currentGroup = 0;

        while ( numReadsToRemove > 0 ) {
            if ( groupReadCounts[currentGroup] > 1 ) {
                groupReadCounts[currentGroup]--;
                numReadsToRemove--;
            }

            currentGroup = (currentGroup + 1) % groupReadCounts.length;
        }

        // Apply the adjusted counts; finalized groups (count 0) are skipped but still
        // advance currentGroup to keep the count array aligned with the list
        currentGroup = 0;
        for ( PositionalReadGrouping group : pendingReads ) {
            if ( ! group.isFinalized() ) {
                group.downsampleActiveReads(groupReadCounts[currentGroup]);
            }
            currentGroup++;
        }
    }

    /**
     * A set of reads sharing one alignment start position. Reads move one-way from the
     * "active" list (still eligible for removal) to the "finalized" list (guaranteed output).
     */
    private class PositionalReadGrouping {
        // Reads still eligible for downsampling
        private List<T> activeReads;
        // Reads committed to the output
        private List<T> finalizedReads;

        // Contig index shared by all reads in this grouping
        private int contig;
        // Alignment start shared by all reads in this grouping
        private int alignmentStart;

        public PositionalReadGrouping( Collection<T> reads, int contig, int alignmentStart ) {
            activeReads = new LinkedList<T>(reads);
            finalizedReads = new ArrayList<T>();
            this.contig = contig;
            this.alignmentStart = alignmentStart;
        }

        public int numActiveReads() {
            return activeReads.size();
        }

        // A grouping is "finalized" once no reads remain eligible for removal
        public boolean isFinalized() {
            return activeReads.size() == 0;
        }

        public List<T> getFinalizedReads() {
            return finalizedReads;
        }

        /**
         * Commit to the output every active read that cannot overlap the given position:
         * all of them on a contig change, otherwise those ending before the position.
         */
        public void finalizeActiveReadsBeforePosition( int contig, int position ) {
            if ( this.contig != contig ) {
                finalizeAllActiveReads();
                return;
            }

            Iterator<T> iter = activeReads.iterator();

            while ( iter.hasNext() ) {
                T read = iter.next();
                if ( read.getAlignmentEnd() < position ) {
                    iter.remove();
                    finalizedReads.add(read);
                }
            }
        }

        public void finalizeAllActiveReads() {
            finalizedReads.addAll(activeReads);
            activeReads.clear();
        }

        /**
         * Randomly discard active reads until only numReadsToKeep remain.
         *
         * @param numReadsToKeep number of active reads to retain; must be in [0, numActiveReads()]
         * @throws ReviewedStingException if numReadsToKeep is out of range
         */
        public void downsampleActiveReads( int numReadsToKeep ) {
            if ( numReadsToKeep > activeReads.size() || numReadsToKeep < 0 ) {
                throw new ReviewedStingException(String.format("Cannot retain %d reads out of %d total reads",
                                                               numReadsToKeep, activeReads.size()));
            }

            // Mark the randomly-chosen survivors by index
            BitSet itemsToKeep = new BitSet(activeReads.size());
            for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(activeReads.size(), numReadsToKeep) ) {
                itemsToKeep.set(selectedIndex);
            }

            int currentIndex = 0;
            Iterator<T> iter = activeReads.iterator();

            // Single pass removing every unmarked read
            while ( iter.hasNext() ) {
                T read = iter.next();

                if ( ! itemsToKeep.get(currentIndex) ) {
                    iter.remove();
                }

                currentIndex++;
            }
        }

    }
}
|
||||
|
|
@ -0,0 +1,40 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
|
||||
/**
 * An extension of the basic downsampler API with reads-specific operations
 *
 * @author David Roazen
 */
public interface ReadsDownsampler<T extends SAMRecord> extends Downsampler<T> {

    /**
     * Does this downsampler require that reads be fed to it in coordinate order?
     *
     * @return true if submitted reads must arrive sorted by coordinate, otherwise false
     */
    public boolean requiresCoordinateSortOrder();
}
|
||||
|
|
@ -0,0 +1,106 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.downsampling;
|
||||
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Reservoir Downsampler: Selects n reads out of a stream whose size is not known in advance, with
|
||||
* every read in the stream having an equal chance of being selected for inclusion.
|
||||
*
|
||||
* An implementation of "Algorithm R" from the paper "Random Sampling with a Reservoir" (Jeffrey Scott Vitter, 1985)
|
||||
*
|
||||
* @author David Roazen
|
||||
*/
|
||||
public class ReservoirDownsampler<T extends SAMRecord> implements ReadsDownsampler<T> {
|
||||
|
||||
private ArrayList<T> reservoir;
|
||||
|
||||
private int targetSampleSize;
|
||||
|
||||
private int totalReadsSeen;
|
||||
|
||||
public ReservoirDownsampler ( int targetSampleSize ) {
|
||||
if ( targetSampleSize <= 0 ) {
|
||||
throw new ReviewedStingException("Cannot do reservoir downsampling with a sample size <= 0");
|
||||
}
|
||||
|
||||
this.targetSampleSize = targetSampleSize;
|
||||
clear();
|
||||
}
|
||||
|
||||
public void submit ( T newRead ) {
|
||||
totalReadsSeen++;
|
||||
|
||||
if ( totalReadsSeen <= targetSampleSize ) {
|
||||
reservoir.add(newRead);
|
||||
}
|
||||
else {
|
||||
int randomSlot = GenomeAnalysisEngine.getRandomGenerator().nextInt(totalReadsSeen);
|
||||
if ( randomSlot < targetSampleSize ) {
|
||||
reservoir.set(randomSlot, newRead);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void submit ( Collection<T> newReads ) {
|
||||
for ( T read : newReads ) {
|
||||
submit(read);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasDownsampledItems() {
|
||||
return reservoir.size() > 0;
|
||||
}
|
||||
|
||||
public List<T> consumeDownsampledItems() {
|
||||
List<T> downsampledItems = reservoir;
|
||||
clear();
|
||||
return downsampledItems;
|
||||
}
|
||||
|
||||
public boolean hasPendingItems() {
|
||||
return false;
|
||||
}
|
||||
|
||||
public void signalEndOfInput() {
|
||||
// NO-OP
|
||||
}
|
||||
|
||||
public void clear() {
|
||||
reservoir = new ArrayList<T>(targetSampleSize);
|
||||
totalReadsSeen = 0;
|
||||
}
|
||||
|
||||
public boolean requiresCoordinateSortOrder() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
@ -32,7 +32,8 @@ import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
|
|||
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidSNPGenotypePriors;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.genotyper.DiploidGenotype;
|
||||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
|
||||
|
||||
|
|
@ -68,8 +69,8 @@ public class GATKPaperGenotyper extends LocusWalker<Integer,Long> implements Tre
|
|||
if (ref.getBase() == 'N' || ref.getBase() == 'n') return null; // we don't deal with the N ref base case
|
||||
|
||||
ReadBackedPileup pileup = context.getBasePileup().getPileupWithoutMappingQualityZeroReads();
|
||||
double likelihoods[] = DiploidSNPGenotypePriors.getReferencePolarizedPriors(ref.getBase(),
|
||||
DiploidSNPGenotypePriors.HUMAN_HETEROZYGOSITY,
|
||||
double likelihoods[] = getReferencePolarizedPriors(ref.getBase(),
|
||||
UnifiedGenotyperEngine.HUMAN_SNP_HETEROZYGOSITY,
|
||||
0.01);
|
||||
// get the bases and qualities from the pileup
|
||||
byte bases[] = pileup.getBases();
|
||||
|
|
@ -104,10 +105,106 @@ public class GATKPaperGenotyper extends LocusWalker<Integer,Long> implements Tre
|
|||
}
|
||||
|
||||
/**
|
||||
* Provide an initial value for reduce computations. In this case we simply return an empty list
|
||||
* Takes reference base, and three priors for hom-ref, het, hom-var, and fills in the priors vector
|
||||
* appropriately.
|
||||
*
|
||||
* @return Initial value of reduce.
|
||||
* Suppose A is the reference base, and we are given the probability of being hom-ref, het, and hom-var,
|
||||
* and that pTriSateGenotype is the true probability of observing reference A and a true genotype of B/C
|
||||
* then this sets the priors to:
|
||||
*
|
||||
* AA = hom-ref
|
||||
* AC = AG = AT = (het - pTriStateGenotype) / 3
|
||||
* CC = GG = TT = hom-var / 3
|
||||
* CG = CT = GT = pTriStateGenotype / 3
|
||||
*
|
||||
* So that we get:
|
||||
*
|
||||
* hom-ref + 3 * (het - pTriStateGenotype) / 3 + 3 * hom-var / 3 + 3 * pTriStateGenotype
|
||||
* hom-ref + het - pTriStateGenotype + hom-var + pTriStateGenotype
|
||||
* hom-ref + het + hom-var
|
||||
* = 1
|
||||
*
|
||||
* @param ref
|
||||
* @param heterozyosity
|
||||
* @param pRefError
|
||||
*/
|
||||
public static double[] getReferencePolarizedPriors(byte ref, double heterozyosity, double pRefError ) {
|
||||
if ( ! MathUtils.isBounded(pRefError, 0.0, 0.01) ) {
|
||||
throw new RuntimeException(String.format("BUG: p Reference error is out of bounds (0.0 - 0.01) is allow range %f", pRefError));
|
||||
}
|
||||
|
||||
double pTriStateGenotype = heterozyosity * pRefError;
|
||||
// if ( pTriStateGenotype >= heterozyosity ) {
|
||||
// throw new RuntimeException(String.format("p Tristate genotype %f is greater than the heterozygosity %f", pTriStateGenotype, heterozyosity));
|
||||
// }
|
||||
|
||||
double pHomRef = heterozygosity2HomRefProbability(heterozyosity);
|
||||
double pHet = heterozygosity2HetProbability(heterozyosity);
|
||||
double pHomVar = heterozygosity2HomVarProbability(heterozyosity);
|
||||
|
||||
if (MathUtils.compareDoubles(pHomRef + pHet + pHomVar, 1.0) != 0) {
|
||||
throw new RuntimeException(String.format("BUG: Prior probabilities don't sum to one => %f, %f, %f", pHomRef, pHet, pHomVar));
|
||||
}
|
||||
|
||||
double[] priors = new double[DiploidGenotype.values().length];
|
||||
|
||||
for ( DiploidGenotype g : DiploidGenotype.values() ) {
|
||||
double POfG;
|
||||
|
||||
final double nOnRefHets = 3;
|
||||
final double nOffRefHets = 3;
|
||||
final double nHomVars = 3;
|
||||
|
||||
if ( g.isHomRef(ref) ) { POfG = pHomRef; }
|
||||
else if ( g.isHomVar(ref) ) { POfG = pHomVar / nHomVars; }
|
||||
else if ( g.isHetRef(ref) ) { POfG = (pHet - pTriStateGenotype ) / nOnRefHets; }
|
||||
else { POfG = pTriStateGenotype / nOffRefHets; }
|
||||
|
||||
priors[g.ordinal()] = Math.log10(POfG);
|
||||
}
|
||||
|
||||
return priors;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @param h
|
||||
* @return
|
||||
*/
|
||||
public static double heterozygosity2HomRefProbability(double h) {
|
||||
if (MathUtils.isNegative(h)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
double v = 1.0 - (3.0 * h / 2.0);
|
||||
if (MathUtils.isNegative(v)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
return v;
|
||||
}
|
||||
|
||||
public static double heterozygosity2HetProbability(double h) {
|
||||
if (MathUtils.isNegative(h)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
return h;
|
||||
}
|
||||
|
||||
public static double heterozygosity2HomVarProbability(double h) {
|
||||
if (MathUtils.isNegative(h)) {
|
||||
throw new RuntimeException(String.format("Heterozygous value is bad %f", h));
|
||||
}
|
||||
|
||||
return h / 2.0;
|
||||
}
|
||||
|
||||
/**
 * Provide an initial value for reduce computations. In this case we simply return a
 * count of zero (not an empty list, despite what earlier docs said).
 *
 * @return Initial value of reduce: zero.
 */
public Long reduceInit() {
    return 0L;
}
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@ import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
|
|||
import org.broadinstitute.sting.gatk.walkers.TreeReducible;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.StingException;
|
||||
import org.broadinstitute.sting.utils.threading.ThreadPoolMonitor;
|
||||
|
||||
import java.util.Collection;
|
||||
|
|
@ -256,7 +255,8 @@ public class HierarchicalMicroScheduler extends MicroScheduler implements Hierar
|
|||
// Specifically catch Tribble I/O exceptions and rethrow them as Reviewed. We don't expect
|
||||
// any issues here because we created the Tribble output file mere moments ago and expect it to
|
||||
// be completely valid.
|
||||
throw new ReviewedStingException("Unable to merge temporary Tribble output file.",ex);
|
||||
final String reason = ex.getMessage();
|
||||
throw new ReviewedStingException("Unable to merge temporary Tribble output file" + (reason == null ? "." : (" (" + reason + ").")), ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@ import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
|
|||
import org.broadinstitute.sting.gatk.io.DirectOutputTracker;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.SampleUtils;
|
||||
|
||||
|
|
|
|||
|
|
@ -6,11 +6,11 @@ import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider;
|
|||
import org.broadinstitute.sting.gatk.datasources.reads.Shard;
|
||||
import org.broadinstitute.sting.gatk.io.ThreadLocalOutputTracker;
|
||||
import org.broadinstitute.sting.gatk.traversals.TraversalEngine;
|
||||
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
|
||||
import org.broadinstitute.sting.gatk.walkers.Walker;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
/**
|
||||
* User: hanna
|
||||
* Date: Apr 29, 2009
|
||||
|
|
@ -79,6 +79,10 @@ public class ShardTraverser implements Callable {
|
|||
microScheduler.reportShardTraverseTime(endTime-startTime);
|
||||
|
||||
return accumulator;
|
||||
} catch(Throwable t) {
|
||||
// Notify that an exception has occurred
|
||||
microScheduler.handleException(new ExecutionException(t));
|
||||
throw new RuntimeException(t);
|
||||
} finally {
|
||||
synchronized(this) {
|
||||
complete = true;
|
||||
|
|
|
|||
|
|
@ -39,9 +39,11 @@ import net.sf.samtools.SAMRecord;
|
|||
public class BadCigarFilter extends ReadFilter {
|
||||
|
||||
public boolean filterOut(final SAMRecord rec) {
|
||||
Cigar c = rec.getCigar();
|
||||
final Cigar c = rec.getCigar();
|
||||
if( c.isEmpty() ) { return false; } // if there is no Cigar then it can't be bad
|
||||
|
||||
boolean previousElementWasIndel = false;
|
||||
CigarOperator lastOp = c.getCigarElement(0).getOperator();
|
||||
CigarOperator lastOp = c.getCigarElement(0).getOperator();
|
||||
|
||||
if (lastOp == CigarOperator.D) // filter out reads starting with deletion
|
||||
return true;
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package org.broadinstitute.sting.gatk.filters;
|
||||
|
||||
import net.sf.picard.filter.SamRecordFilter;
|
||||
import net.sf.samtools.SAMRecord;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
|
||||
|
||||
|
|
@ -16,4 +17,19 @@ public abstract class ReadFilter implements SamRecordFilter {
|
|||
* @param engine the engine.
|
||||
*/
|
||||
public void initialize(GenomeAnalysisEngine engine) {}
|
||||
|
||||
|
||||
/**
 * Determines whether a pair of SAMRecords matches this filter.
 *
 * Default behavior: paired filtering is unsupported; subclasses that filter on read
 * pairs must override this method.
 *
 * @param first the first SAMRecord to evaluate
 * @param second the second SAMRecord to evaluate
 *
 * @return true if the SAMRecords match the filter, otherwise false
 * @throws UnsupportedOperationException always, unless overridden by a subclass
 */
@Override
public boolean filterOut(final SAMRecord first, final SAMRecord second) {
    throw new UnsupportedOperationException("Paired filter not implemented: " + this.getClass());
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -28,7 +28,7 @@ package org.broadinstitute.sting.gatk.io.storage;
|
|||
import org.broadinstitute.sting.gatk.io.stubs.OutputStreamStub;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.SAMFileWriterStub;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.Stub;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -77,12 +77,12 @@ public class StorageFactory {
|
|||
else
|
||||
storage = new SAMFileWriterStorage((SAMFileWriterStub)stub);
|
||||
}
|
||||
else if(stub instanceof VCFWriterStub) {
|
||||
VCFWriterStub vcfWriterStub = (VCFWriterStub)stub;
|
||||
else if(stub instanceof VariantContextWriterStub) {
|
||||
VariantContextWriterStub vcfWriterStub = (VariantContextWriterStub)stub;
|
||||
if( file != null )
|
||||
storage = new VCFWriterStorage(vcfWriterStub,file);
|
||||
storage = new VariantContextWriterStorage(vcfWriterStub,file);
|
||||
else
|
||||
storage = new VCFWriterStorage(vcfWriterStub);
|
||||
storage = new VariantContextWriterStorage(vcfWriterStub);
|
||||
}
|
||||
else
|
||||
throw new ReviewedStingException("Unsupported stub type: " + stub.getClass().getName());
|
||||
|
|
|
|||
|
|
@ -1,128 +0,0 @@
|
|||
package org.broadinstitute.sting.gatk.io.storage;
|
||||
|
||||
import net.sf.samtools.util.BlockCompressedOutputStream;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.source.BasicFeatureSource;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.VCFWriterStub;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.StandardVCFWriter;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
import java.io.PrintStream;
|
||||
|
||||
/**
|
||||
* Provides temporary and permanent storage for genotypes in VCF format.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class VCFWriterStorage implements Storage<VCFWriterStorage>, VCFWriter {
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(VCFWriterStorage.class);
|
||||
|
||||
protected final File file;
|
||||
protected OutputStream stream;
|
||||
protected final VCFWriter writer;
|
||||
|
||||
/**
|
||||
* Constructs an object which will write directly into the output file provided by the stub.
|
||||
* Intentionally delaying the writing of the header -- this should be filled in by the walker.
|
||||
* @param stub Stub to use when constructing the output file.
|
||||
*/
|
||||
public VCFWriterStorage( VCFWriterStub stub ) {
|
||||
if ( stub.getFile() != null ) {
|
||||
this.file = stub.getFile();
|
||||
writer = vcfWriterToFile(stub,stub.getFile(),true);
|
||||
}
|
||||
else if ( stub.getOutputStream() != null ) {
|
||||
this.file = null;
|
||||
this.stream = stub.getOutputStream();
|
||||
writer = new StandardVCFWriter(stream, stub.getMasterSequenceDictionary(), stub.doNotWriteGenotypes());
|
||||
}
|
||||
else
|
||||
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");
|
||||
}
|
||||
|
||||
/**
|
||||
* common initialization routine for multiple constructors
|
||||
* @param stub Stub to use when constructing the output file.
|
||||
* @param file Target file into which to write VCF records.
|
||||
* @param indexOnTheFly true to index the file on the fly. NOTE: will be forced to false for compressed files.
|
||||
* @return A VCF writer for use with this class
|
||||
*/
|
||||
private StandardVCFWriter vcfWriterToFile(VCFWriterStub stub, File file, boolean indexOnTheFly) {
|
||||
try {
|
||||
if ( stub.isCompressed() )
|
||||
stream = new BlockCompressedOutputStream(file);
|
||||
else
|
||||
stream = new PrintStream(file);
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);
|
||||
}
|
||||
|
||||
// The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it.
|
||||
return new StandardVCFWriter(file, this.stream, stub.getMasterSequenceDictionary(), indexOnTheFly && !stub.isCompressed(), stub.doNotWriteGenotypes());
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Constructs an object which will redirect into a different file.
|
||||
* @param stub Stub to use when synthesizing file / header info.
|
||||
* @param tempFile File into which to direct the output data.
|
||||
*/
|
||||
public VCFWriterStorage(VCFWriterStub stub, File tempFile) {
|
||||
logger.debug("Creating temporary VCF file " + tempFile.getAbsolutePath() + " for VCF output.");
|
||||
this.file = tempFile;
|
||||
this.writer = vcfWriterToFile(stub, file, false);
|
||||
writer.writeHeader(stub.getVCFHeader());
|
||||
}
|
||||
|
||||
public void add(VariantContext vc) {
|
||||
writer.add(vc);
|
||||
}
|
||||
|
||||
/**
|
||||
* initialize this VCF header
|
||||
*
|
||||
* @param header the header
|
||||
*/
|
||||
public void writeHeader(VCFHeader header) {
|
||||
writer.writeHeader(header);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the VCF storage object.
|
||||
*/
|
||||
public void close() {
|
||||
if(file != null)
|
||||
logger.debug("Closing temporary file " + file.getAbsolutePath());
|
||||
writer.close();
|
||||
}
|
||||
|
||||
public void mergeInto(VCFWriterStorage target) {
|
||||
try {
|
||||
String sourceFilePath = file.getAbsolutePath();
|
||||
String targetFilePath = target.file != null ? target.file.getAbsolutePath() : "/dev/stdin";
|
||||
logger.debug(String.format("Merging %s into %s",sourceFilePath,targetFilePath));
|
||||
BasicFeatureSource<VariantContext> source = BasicFeatureSource.getFeatureSource(file.getAbsolutePath(), new VCFCodec(), false);
|
||||
|
||||
for ( VariantContext vc : source.iterator() ) {
|
||||
target.writer.add(vc);
|
||||
}
|
||||
|
||||
source.close();
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,193 @@
|
|||
/*
|
||||
* Copyright (c) 2012, The Broad Institute
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person
|
||||
* obtaining a copy of this software and associated documentation
|
||||
* files (the "Software"), to deal in the Software without
|
||||
* restriction, including without limitation the rights to use,
|
||||
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following
|
||||
* conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
||||
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
||||
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
||||
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
||||
* OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
package org.broadinstitute.sting.gatk.io.storage;
|
||||
|
||||
import net.sf.samtools.util.BlockCompressedOutputStream;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.AbstractFeatureReader;
|
||||
import org.broadinstitute.sting.gatk.io.stubs.VariantContextWriterStub;
|
||||
import org.broadinstitute.sting.utils.codecs.bcf2.BCF2Utils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriterFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Provides temporary and permanent storage for genotypes in VCF format.
|
||||
*
|
||||
* @author mhanna
|
||||
* @version 0.1
|
||||
*/
|
||||
public class VariantContextWriterStorage implements Storage<VariantContextWriterStorage>, VariantContextWriter {
|
||||
/**
|
||||
* our log, which we want to capture anything from this class
|
||||
*/
|
||||
private static Logger logger = Logger.getLogger(VariantContextWriterStorage.class);
|
||||
|
||||
private final static int BUFFER_SIZE = 1048576;
|
||||
|
||||
protected final File file;
|
||||
protected OutputStream stream;
|
||||
protected final VariantContextWriter writer;
|
||||
|
||||
/**
|
||||
* Constructs an object which will write directly into the output file provided by the stub.
|
||||
* Intentionally delaying the writing of the header -- this should be filled in by the walker.
|
||||
* @param stub Stub to use when constructing the output file.
|
||||
*/
|
||||
public VariantContextWriterStorage(VariantContextWriterStub stub) {
|
||||
if ( stub.getFile() != null ) {
|
||||
this.file = stub.getFile();
|
||||
writer = vcfWriterToFile(stub,stub.getFile(),true);
|
||||
}
|
||||
else if ( stub.getOutputStream() != null ) {
|
||||
this.file = null;
|
||||
this.stream = stub.getOutputStream();
|
||||
writer = VariantContextWriterFactory.create(stream,
|
||||
stub.getMasterSequenceDictionary(), stub.getWriterOptions(false));
|
||||
}
|
||||
else
|
||||
throw new ReviewedStingException("Unable to create target to which to write; storage was provided with neither a file nor a stream.");
|
||||
}
|
||||
|
||||
/**
|
||||
* common initialization routine for multiple constructors
|
||||
* @param stub Stub to use when constructing the output file.
|
||||
* @param file Target file into which to write VCF records.
|
||||
* @param indexOnTheFly true to index the file on the fly. NOTE: will be forced to false for compressed files.
|
||||
* @return A VCF writer for use with this class
|
||||
*/
|
||||
private VariantContextWriter vcfWriterToFile(VariantContextWriterStub stub, File file, boolean indexOnTheFly) {
|
||||
try {
|
||||
if ( stub.isCompressed() )
|
||||
stream = new BlockCompressedOutputStream(file);
|
||||
else
|
||||
stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(file), BUFFER_SIZE));
|
||||
}
|
||||
catch(IOException ex) {
|
||||
throw new UserException.CouldNotCreateOutputFile(file, "Unable to open target output stream", ex);
|
||||
}
|
||||
|
||||
// The GATK/Tribble can't currently index block-compressed files on the fly. Disable OTF indexing even if the user explicitly asked for it.
|
||||
EnumSet<Options> options = stub.getWriterOptions(indexOnTheFly);
|
||||
VariantContextWriter writer = VariantContextWriterFactory.create(file, this.stream, stub.getMasterSequenceDictionary(), options);
|
||||
|
||||
// if the stub says to test BCF, create a secondary writer to BCF and an 2 way out writer to send to both
|
||||
// TODO -- remove me when argument generateShadowBCF is removed
|
||||
if ( stub.alsoWriteBCFForTest() && ! VariantContextWriterFactory.isBCFOutput(file, options)) {
|
||||
final File bcfFile = BCF2Utils.shadowBCF(file);
|
||||
if ( bcfFile != null ) {
|
||||
VariantContextWriter bcfWriter = VariantContextWriterFactory.create(bcfFile, stub.getMasterSequenceDictionary(), options);
|
||||
writer = new TestWriter(writer, bcfWriter);
|
||||
}
|
||||
}
|
||||
|
||||
return writer;
|
||||
}
|
||||
|
||||
private final static class TestWriter implements VariantContextWriter {
|
||||
final List<VariantContextWriter> writers;
|
||||
|
||||
private TestWriter(final VariantContextWriter ... writers) {
|
||||
this.writers = Arrays.asList(writers);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeHeader(final VCFHeader header) {
|
||||
for ( final VariantContextWriter writer : writers ) writer.writeHeader(header);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
for ( final VariantContextWriter writer : writers ) writer.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void add(final VariantContext vc) {
|
||||
for ( final VariantContextWriter writer : writers ) writer.add(vc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Constructs an object which will redirect into a different file.
|
||||
* @param stub Stub to use when synthesizing file / header info.
|
||||
* @param tempFile File into which to direct the output data.
|
||||
*/
|
||||
public VariantContextWriterStorage(VariantContextWriterStub stub, File tempFile) {
|
||||
logger.debug("Creating temporary VCF file " + tempFile.getAbsolutePath() + " for VCF output.");
|
||||
this.file = tempFile;
|
||||
this.writer = vcfWriterToFile(stub, file, false);
|
||||
writer.writeHeader(stub.getVCFHeader());
|
||||
}
|
||||
|
||||
public void add(VariantContext vc) {
|
||||
writer.add(vc);
|
||||
}
|
||||
|
||||
/**
|
||||
* initialize this VCF header
|
||||
*
|
||||
* @param header the header
|
||||
*/
|
||||
public void writeHeader(VCFHeader header) {
|
||||
writer.writeHeader(header);
|
||||
}
|
||||
|
||||
/**
|
||||
* Close the VCF storage object.
|
||||
*/
|
||||
public void close() {
|
||||
if(file != null)
|
||||
logger.debug("Closing temporary file " + file.getAbsolutePath());
|
||||
writer.close();
|
||||
}
|
||||
|
||||
public void mergeInto(VariantContextWriterStorage target) {
|
||||
try {
|
||||
String sourceFilePath = file.getAbsolutePath();
|
||||
String targetFilePath = target.file != null ? target.file.getAbsolutePath() : "/dev/stdin";
|
||||
logger.debug(String.format("Merging %s into %s",sourceFilePath,targetFilePath));
|
||||
AbstractFeatureReader<VariantContext> source = AbstractFeatureReader.getFeatureReader(file.getAbsolutePath(), new VCFCodec(), false);
|
||||
|
||||
for ( VariantContext vc : source.iterator() ) {
|
||||
target.writer.add(vc);
|
||||
}
|
||||
|
||||
source.close();
|
||||
} catch (IOException e) {
|
||||
throw new UserException.CouldNotReadInputFile(file, "Error reading file in VCFWriterStorage: ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -116,9 +116,9 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
|
|||
String compressionLevelText = getArgumentValue( createBAMCompressionArgumentDefinition(source), matches );
|
||||
Integer compressionLevel = compressionLevelText != null ? Integer.valueOf(compressionLevelText) : null;
|
||||
|
||||
Boolean indexOnTheFly = !argumentIsPresent(disableWriteIndexArgumentDefinition(source),matches) ? true : null;
|
||||
Boolean generateMD5 = argumentIsPresent(this.enableMD5GenerationArgumentDefinition(source),matches) ? true : null;
|
||||
Boolean simplifyBAM = argumentIsPresent(createSimplifyBAMArgumentDefinition(source),matches);
|
||||
boolean indexOnTheFly = !argumentIsPresent(disableWriteIndexArgumentDefinition(source),matches);
|
||||
boolean generateMD5 = argumentIsPresent(this.enableMD5GenerationArgumentDefinition(source),matches);
|
||||
boolean simplifyBAM = argumentIsPresent(createSimplifyBAMArgumentDefinition(source),matches);
|
||||
|
||||
// Validate the combination of parameters passed in.
|
||||
|
||||
|
|
@ -132,15 +132,19 @@ public class SAMFileWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor
|
|||
}
|
||||
|
||||
// Create the stub and set parameters.
|
||||
SAMFileWriterStub stub = new SAMFileWriterStub(engine, new File(writerFileName));
|
||||
SAMFileWriterStub stub;
|
||||
if ( writerFileName != null )
|
||||
stub = new SAMFileWriterStub(engine, new File(writerFileName));
|
||||
else
|
||||
stub = new SAMFileWriterStub(engine, defaultOutputStream);
|
||||
|
||||
if( compressionLevel != null )
|
||||
if ( compressionLevel != null )
|
||||
stub.setCompressionLevel(compressionLevel);
|
||||
if(indexOnTheFly != null)
|
||||
if ( indexOnTheFly )
|
||||
stub.setIndexOnTheFly(indexOnTheFly);
|
||||
if(generateMD5 != null)
|
||||
if ( generateMD5 )
|
||||
stub.setGenerateMD5(generateMD5);
|
||||
if(simplifyBAM != null)
|
||||
if ( simplifyBAM )
|
||||
stub.setSimplifyBAM(simplifyBAM);
|
||||
|
||||
// WARNING: Side effects required by engine!
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.io.stubs;
|
|||
|
||||
import org.broadinstitute.sting.commandline.*;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
|
||||
import java.io.File;
|
||||
|
|
@ -45,7 +45,7 @@ import java.util.List;
|
|||
* @version 0.1
|
||||
*/
|
||||
public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
||||
public static final String NO_HEADER_ARG_NAME = "NO_HEADER";
|
||||
public static final String NO_HEADER_ARG_NAME = "no_cmdline_in_header";
|
||||
public static final String SITES_ONLY_ARG_NAME = "sites_only";
|
||||
public static final HashSet<String> SUPPORTED_ZIPPED_SUFFIXES = new HashSet<String>();
|
||||
|
||||
|
|
@ -91,12 +91,12 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
*/
|
||||
@Override
|
||||
public boolean supports( Class type ) {
|
||||
return VCFWriter.class.equals(type);
|
||||
return VariantContextWriter.class.equals(type);
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ArgumentDefinition> createArgumentDefinitions( ArgumentSource source ) {
|
||||
return Arrays.asList( createDefaultArgumentDefinition(source),createNoHeaderArgumentDefinition(),createSitesOnlyArgumentDefinition());
|
||||
return Arrays.asList( createDefaultArgumentDefinition(source), createNoCommandLineHeaderArgumentDefinition(),createSitesOnlyArgumentDefinition());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -117,7 +117,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
public Object createTypeDefault(ParsingEngine parsingEngine,ArgumentSource source, Type type) {
|
||||
if(!source.isRequired())
|
||||
throw new ReviewedStingException("BUG: tried to create type default for argument type descriptor that can't support a type default.");
|
||||
VCFWriterStub stub = new VCFWriterStub(engine, defaultOutputStream, false, argumentSources, false, false);
|
||||
VariantContextWriterStub stub = new VariantContextWriterStub(engine, defaultOutputStream, false, argumentSources, false, false);
|
||||
engine.addOutput(stub);
|
||||
return stub;
|
||||
}
|
||||
|
|
@ -144,12 +144,12 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
// Should we compress the output stream?
|
||||
boolean compress = isCompressed(writerFileName);
|
||||
|
||||
boolean skipWritingHeader = argumentIsPresent(createNoHeaderArgumentDefinition(),matches);
|
||||
boolean skipWritingCmdLineHeader = argumentIsPresent(createNoCommandLineHeaderArgumentDefinition(),matches);
|
||||
boolean doNotWriteGenotypes = argumentIsPresent(createSitesOnlyArgumentDefinition(),matches);
|
||||
|
||||
// Create a stub for the given object.
|
||||
VCFWriterStub stub = (writerFile != null) ? new VCFWriterStub(engine, writerFile, compress, argumentSources, skipWritingHeader, doNotWriteGenotypes)
|
||||
: new VCFWriterStub(engine, defaultOutputStream, compress, argumentSources, skipWritingHeader, doNotWriteGenotypes);
|
||||
VariantContextWriterStub stub = (writerFile != null) ? new VariantContextWriterStub(engine, writerFile, compress, argumentSources, skipWritingCmdLineHeader, doNotWriteGenotypes)
|
||||
: new VariantContextWriterStub(engine, defaultOutputStream, compress, argumentSources, skipWritingCmdLineHeader, doNotWriteGenotypes);
|
||||
|
||||
// WARNING: Side effects required by engine!
|
||||
parsingEngine.addTags(stub,getArgumentTags(matches));
|
||||
|
|
@ -162,7 +162,7 @@ public class VCFWriterArgumentTypeDescriptor extends ArgumentTypeDescriptor {
|
|||
* Creates the optional compression level argument for the BAM file.
|
||||
* @return Argument definition for the BAM file itself. Will not be null.
|
||||
*/
|
||||
private ArgumentDefinition createNoHeaderArgumentDefinition() {
|
||||
private ArgumentDefinition createNoCommandLineHeaderArgumentDefinition() {
|
||||
return new ArgumentDefinition( ArgumentIOType.ARGUMENT,
|
||||
boolean.class,
|
||||
NO_HEADER_ARG_NAME,
|
||||
|
|
|
|||
|
|
@ -25,20 +25,24 @@
|
|||
package org.broadinstitute.sting.gatk.io.stubs;
|
||||
|
||||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.SAMSequenceRecord;
|
||||
import org.broadinstitute.sting.gatk.CommandLineExecutable;
|
||||
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
|
||||
import org.broadinstitute.sting.gatk.io.OutputTracker;
|
||||
import org.broadinstitute.sting.utils.classloader.JVMUtils;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLine;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFWriter;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFUtils;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.Options;
|
||||
import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter;
|
||||
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.OutputStream;
|
||||
import java.io.PrintStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.EnumSet;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A stub for routing and management of genotype reading and writing.
|
||||
|
|
@ -46,7 +50,9 @@ import java.util.Collection;
|
|||
* @author ebanks
|
||||
* @version 0.1
|
||||
*/
|
||||
public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
|
||||
public class VariantContextWriterStub implements Stub<VariantContextWriter>, VariantContextWriter {
|
||||
public final static boolean UPDATE_CONTIG_HEADERS = true;
|
||||
|
||||
/**
|
||||
* The engine, central to the GATK's processing.
|
||||
*/
|
||||
|
|
@ -83,7 +89,7 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
|
|||
/**
|
||||
* Should the header be written out? A hidden argument.
|
||||
*/
|
||||
private final boolean skipWritingHeader;
|
||||
private final boolean skipWritingCommandLineHeader;
|
||||
|
||||
/**
|
||||
* Should we not write genotypes even when provided?
|
||||
|
|
@ -103,16 +109,16 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
|
|||
* @param genotypeFile file to (ultimately) create.
|
||||
* @param isCompressed should we compress the output stream?
|
||||
* @param argumentSources sources.
|
||||
* @param skipWritingHeader skip writing header.
|
||||
* @param skipWritingCommandLineHeader skip writing header.
|
||||
* @param doNotWriteGenotypes do not write genotypes.
|
||||
*/
|
||||
public VCFWriterStub(GenomeAnalysisEngine engine, File genotypeFile, boolean isCompressed, Collection<Object> argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) {
|
||||
public VariantContextWriterStub(GenomeAnalysisEngine engine, File genotypeFile, boolean isCompressed, Collection<Object> argumentSources, boolean skipWritingCommandLineHeader, boolean doNotWriteGenotypes) {
|
||||
this.engine = engine;
|
||||
this.genotypeFile = genotypeFile;
|
||||
this.genotypeStream = null;
|
||||
this.isCompressed = isCompressed;
|
||||
this.argumentSources = argumentSources;
|
||||
this.skipWritingHeader = skipWritingHeader;
|
||||
this.skipWritingCommandLineHeader = skipWritingCommandLineHeader;
|
||||
this.doNotWriteGenotypes = doNotWriteGenotypes;
|
||||
}
|
||||
|
||||
|
|
@ -123,16 +129,16 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
|
|||
* @param genotypeStream stream to (ultimately) write.
|
||||
* @param isCompressed should we compress the output stream?
|
||||
* @param argumentSources sources.
|
||||
* @param skipWritingHeader skip writing header.
|
||||
* @param skipWritingCommandLineHeader skip writing header.
|
||||
* @param doNotWriteGenotypes do not write genotypes.
|
||||
*/
|
||||
public VCFWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, boolean isCompressed, Collection<Object> argumentSources, boolean skipWritingHeader, boolean doNotWriteGenotypes) {
|
||||
public VariantContextWriterStub(GenomeAnalysisEngine engine, OutputStream genotypeStream, boolean isCompressed, Collection<Object> argumentSources, boolean skipWritingCommandLineHeader, boolean doNotWriteGenotypes) {
|
||||
this.engine = engine;
|
||||
this.genotypeFile = null;
|
||||
this.genotypeStream = new PrintStream(genotypeStream);
|
||||
this.isCompressed = isCompressed;
|
||||
this.argumentSources = argumentSources;
|
||||
this.skipWritingHeader = skipWritingHeader;
|
||||
this.skipWritingCommandLineHeader = skipWritingCommandLineHeader;
|
||||
this.doNotWriteGenotypes = doNotWriteGenotypes;
|
||||
}
|
||||
|
||||
|
|
@ -169,12 +175,18 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
|
|||
return engine.getMasterSequenceDictionary();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we tell the VCF writer not to write genotypes?
|
||||
* @return true if the writer should not write genotypes.
|
||||
*/
|
||||
public boolean doNotWriteGenotypes() {
|
||||
return doNotWriteGenotypes;
|
||||
public EnumSet<Options> getWriterOptions() {
|
||||
return getWriterOptions(false);
|
||||
}
|
||||
|
||||
public EnumSet<Options> getWriterOptions(boolean indexOnTheFly) {
|
||||
List<Options> options = new ArrayList<Options>();
|
||||
|
||||
if ( doNotWriteGenotypes ) options.add(Options.DO_NOT_WRITE_GENOTYPES);
|
||||
if ( engine.lenientVCFProcessing() ) options.add(Options.ALLOW_MISSING_FIELDS_IN_HEADER);
|
||||
if ( indexOnTheFly && ! isCompressed() ) options.add(Options.INDEX_ON_THE_FLY);
|
||||
|
||||
return options.isEmpty() ? EnumSet.noneOf(Options.class) : EnumSet.copyOf(options);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -196,26 +208,18 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
|
|||
public void writeHeader(VCFHeader header) {
|
||||
vcfHeader = header;
|
||||
|
||||
// Check for the command-line argument header line. If not present, add it in.
|
||||
if (!skipWritingHeader && header.isWriteEngineHeaders()) {
|
||||
|
||||
if (header.isWriteCommandLine()) {
|
||||
VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine();
|
||||
boolean foundCommandLineHeaderLine = false;
|
||||
for (VCFHeaderLine line: vcfHeader.getMetaData()) {
|
||||
if ( line.getKey().equals(commandLineArgHeaderLine.getKey()) )
|
||||
foundCommandLineHeaderLine = true;
|
||||
}
|
||||
if ( !foundCommandLineHeaderLine )
|
||||
if ( header.isWriteEngineHeaders() ) {
|
||||
// skip writing the command line header if requested
|
||||
if ( ! skipWritingCommandLineHeader && header.isWriteCommandLine() ) {
|
||||
// Check for the command-line argument header line. If not present, add it in.
|
||||
final VCFHeaderLine commandLineArgHeaderLine = getCommandLineArgumentHeaderLine();
|
||||
final boolean foundCommandLineHeaderLine = vcfHeader.getMetaDataLine(commandLineArgHeaderLine.getKey()) != null;
|
||||
if ( ! foundCommandLineHeaderLine )
|
||||
vcfHeader.addMetaDataLine(commandLineArgHeaderLine);
|
||||
}
|
||||
|
||||
// also put in the reference contig header lines
|
||||
String assembly = getReferenceAssembly(engine.getArguments().referenceFile.getName());
|
||||
for ( SAMSequenceRecord contig : engine.getReferenceDataSource().getReference().getSequenceDictionary().getSequences() )
|
||||
vcfHeader.addMetaDataLine(getContigHeaderLine(contig, assembly));
|
||||
|
||||
vcfHeader.addMetaDataLine(new VCFHeaderLine(VCFHeader.REFERENCE_KEY, "file://" + engine.getArguments().referenceFile.getAbsolutePath()));
|
||||
if ( UPDATE_CONTIG_HEADERS )
|
||||
vcfHeader = VCFUtils.withUpdatedContigs(vcfHeader, engine);
|
||||
}
|
||||
|
||||
outputTracker.getStorage(this).writeHeader(vcfHeader);
|
||||
|
|
@ -244,6 +248,20 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
|
|||
return getClass().getName();
|
||||
}
|
||||
|
||||
/**
|
||||
* Should we also write a BCF file alongside our VCF file for testing
|
||||
*
|
||||
* TODO -- remove me when argument generateShadowBCF is removed
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public boolean alsoWriteBCFForTest() {
|
||||
return engine.getArguments().numberOfThreads == 1 && // only works single threaded
|
||||
! isCompressed() && // for non-compressed outputs
|
||||
getFile() != null && // that are going to disk
|
||||
engine.getArguments().generateShadowBCF; // and we actually want to do it
|
||||
}
|
||||
|
||||
/**
|
||||
* Gets the appropriately formatted header for a VCF file
|
||||
* @return VCF file header.
|
||||
|
|
@ -252,27 +270,4 @@ public class VCFWriterStub implements Stub<VCFWriter>, VCFWriter {
|
|||
CommandLineExecutable executable = JVMUtils.getObjectOfType(argumentSources,CommandLineExecutable.class);
|
||||
return new VCFHeaderLine(executable.getAnalysisName(), "\"" + engine.createApproximateCommandLineArgumentString(argumentSources.toArray()) + "\"");
|
||||
}
|
||||
|
||||
private VCFHeaderLine getContigHeaderLine(SAMSequenceRecord contig, String assembly) {
|
||||
String val;
|
||||
if ( assembly != null )
|
||||
val = String.format("<ID=%s,length=%d,assembly=%s>", contig.getSequenceName(), contig.getSequenceLength(), assembly);
|
||||
else
|
||||
val = String.format("<ID=%s,length=%d>", contig.getSequenceName(), contig.getSequenceLength());
|
||||
return new VCFHeaderLine(VCFHeader.CONTIG_KEY, val);
|
||||
}
|
||||
|
||||
private String getReferenceAssembly(String refPath) {
|
||||
// This doesn't need to be perfect as it's not a required VCF header line, but we might as well give it a shot
|
||||
String assembly = null;
|
||||
if (refPath.contains("b37") || refPath.contains("v37"))
|
||||
assembly = "b37";
|
||||
else if (refPath.contains("b36"))
|
||||
assembly = "b36";
|
||||
else if (refPath.contains("hg18"))
|
||||
assembly = "hg18";
|
||||
else if (refPath.contains("hg19"))
|
||||
assembly = "hg19";
|
||||
return assembly;
|
||||
}
|
||||
}
|
||||
|
|
@ -40,9 +40,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
|||
import org.broadinstitute.sting.utils.MathUtils;
|
||||
import org.broadinstitute.sting.utils.ReservoirDownsampler;
|
||||
import org.broadinstitute.sting.utils.exceptions.UserException;
|
||||
import org.broadinstitute.sting.utils.pileup.ExtendedEventPileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.PileupElement;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedExtendedEventPileupImpl;
|
||||
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
|
||||
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
|
||||
import org.broadinstitute.sting.utils.sam.ReadUtils;
|
||||
|
|
@ -63,7 +61,6 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
// member fields
|
||||
//
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
private boolean hasExtendedEvents = false; // will be set to true if at least one read had an indel right before the current position
|
||||
|
||||
/**
|
||||
* Used to create new GenomeLocs.
|
||||
|
|
@ -92,26 +89,10 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
// stepForwardOnGenome(). The next call to stepForwardOnGenome() will clear that memory (as we remember only extended
|
||||
// events immediately preceding the current reference base).
|
||||
|
||||
boolean generateExtendedEvents = true; // should we generate an additional, special pile for indels between the ref bases?
|
||||
// the only purpose of this flag is to shield away a few additional lines of code
|
||||
// when extended piles are not needed, it may not be even worth it...
|
||||
|
||||
byte[] insertedBases = null; // remember full inserted sequence if we are generating piles of extended events (indels)
|
||||
int eventLength = -1; // will be set to the length of insertion/deletion if we are generating piles of extended events
|
||||
byte eventDelayedFlag = 0; // will be set to non-0 if there was an event (indel) right before the
|
||||
// current base on the ref. We use a counter-like variable here since clearing the indel event is
|
||||
// delayed by one base, so we need to remember how long ago we have seen the actual event
|
||||
|
||||
int eventStart = -1; // where on the read the extended event starts (i.e. the last position on the read prior to the
|
||||
// event, or -1 if alignment starts with an insertion); this one is easy to recompute on the fly,
|
||||
// we cache it here mainly for convenience
|
||||
|
||||
|
||||
public SAMRecordState(SAMRecord read, boolean extended) {
|
||||
public SAMRecordState(SAMRecord read) {
|
||||
this.read = read;
|
||||
cigar = read.getCigar();
|
||||
nCigarElements = cigar.numCigarElements();
|
||||
generateExtendedEvents = extended;
|
||||
|
||||
//System.out.printf("Creating a SAMRecordState: %s%n", this);
|
||||
}
|
||||
|
|
@ -150,27 +131,6 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
return curElement.getOperator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true if we just stepped over insertion/into a deletion prior to the last return from stepForwardOnGenome.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public boolean hadIndel() {
|
||||
return (eventLength > 0);
|
||||
}
|
||||
|
||||
public int getEventLength() {
|
||||
return eventLength;
|
||||
}
|
||||
|
||||
public byte[] getEventBases() {
|
||||
return insertedBases;
|
||||
}
|
||||
|
||||
public int getReadEventStartOffset() {
|
||||
return eventStart;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.format("%s ro=%d go=%d co=%d cec=%d %s", read.getReadName(), readOffset, genomeOffset, cigarOffset, cigarElementCounter, curElement);
|
||||
}
|
||||
|
|
@ -208,19 +168,6 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
genomeOffset++; // extended events need that. Logically, it's legal to advance the genomic offset here:
|
||||
// we do step forward on the ref, and by returning null we also indicate that we are past the read end.
|
||||
|
||||
if (generateExtendedEvents && eventDelayedFlag > 0) {
|
||||
|
||||
// if we had an indel right before the read ended (i.e. insertion was the last cigar element),
|
||||
// we keep it until next reference base; then we discard it and this will allow the LocusIterator to
|
||||
// finally discard this read
|
||||
eventDelayedFlag--;
|
||||
if (eventDelayedFlag == 0) {
|
||||
eventLength = -1; // reset event when we are past it
|
||||
insertedBases = null;
|
||||
eventStart = -1;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -232,17 +179,6 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
cigarElementCounter = curElement.getLength();
|
||||
break;
|
||||
case I: // insertion w.r.t. the reference
|
||||
if (generateExtendedEvents) {
|
||||
// we see insertions only once, when we step right onto them; the position on the read is scrolled
|
||||
// past the insertion right after that
|
||||
if (eventDelayedFlag > 1)
|
||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
|
||||
insertedBases = Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + curElement.getLength());
|
||||
eventLength = curElement.getLength();
|
||||
eventStart = readOffset;
|
||||
eventDelayedFlag = 2; // insertion causes re-entry into stepForwardOnGenome, so we set the delay to 2
|
||||
// System.out.println("Inserted "+(new String (insertedBases)) +" after "+readOffset);
|
||||
} // continue onto the 'S' case !
|
||||
case S: // soft clip
|
||||
cigarElementCounter = curElement.getLength();
|
||||
readOffset += curElement.getLength();
|
||||
|
|
@ -250,19 +186,6 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
case D: // deletion w.r.t. the reference
|
||||
if (readOffset < 0) // we don't want reads starting with deletion, this is a malformed cigar string
|
||||
throw new UserException.MalformedBAM(read, "Read starting with deletion. Cigar: " + read.getCigarString() + ". This is an indication of a malformed file, but the SAM spec allows reads starting in deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar");
|
||||
if (generateExtendedEvents) {
|
||||
if (cigarElementCounter == 1) {
|
||||
// generate an extended event only if we just stepped into the deletion (i.e. don't
|
||||
// generate the event at every deleted position on the ref, that's what cigarElementCounter==1 is for!)
|
||||
if (eventDelayedFlag > 1)
|
||||
throw new UserException.MalformedBAM(read, String.format("Adjacent I/D events in read %s -- cigar: %s. This is an indication of a malformed file, but the SAM spec allows reads with adjacent insertion/deletion. If you are sure you want to use this read, re-run your analysis with the extra option: -rf BadCigar", read.getReadName(), read.getCigarString()));
|
||||
eventLength = curElement.getLength();
|
||||
eventDelayedFlag = 2; // deletion on the ref causes an immediate return, so we have to delay by 1 only
|
||||
eventStart = readOffset;
|
||||
insertedBases = null;
|
||||
// System.out.println("Deleted "+eventLength +" bases after "+readOffset);
|
||||
}
|
||||
}
|
||||
// should be the same as N case
|
||||
genomeOffset++;
|
||||
done = true;
|
||||
|
|
@ -280,21 +203,6 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
throw new IllegalStateException("Case statement didn't deal with cigar op: " + curElement.getOperator());
|
||||
}
|
||||
|
||||
if (generateExtendedEvents) {
|
||||
if (eventDelayedFlag > 0 && done) {
|
||||
// if we did make a successful step on the ref, decrement delayed flag. If, upon the decrementing the,
|
||||
// the flag is 1, we are standing on the reference base right after the indel (so we have to keep it).
|
||||
// Otherwise, we are away from the previous indel and have to clear our memories...
|
||||
eventDelayedFlag--; // when we notice an indel, we set delayed flag to 2, so now
|
||||
// if eventDelayedFlag == 1, an indel occured right before the current base
|
||||
if (eventDelayedFlag == 0) {
|
||||
eventLength = -1; // reset event when we are past it
|
||||
insertedBases = null;
|
||||
eventStart = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return done ? curElement.getOperator() : stepForwardOnGenome();
|
||||
}
|
||||
}
|
||||
|
|
@ -374,147 +282,69 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
// this call will set hasExtendedEvents to true if it picks up a read with indel right before the current position on the ref:
|
||||
readStates.collectPendingReads();
|
||||
|
||||
int size = 0;
|
||||
int nDeletions = 0;
|
||||
int nInsertions = 0;
|
||||
int nMQ0Reads = 0;
|
||||
final GenomeLoc location = getLocation();
|
||||
final Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
||||
boolean hasBeenSampled = false;
|
||||
for (final String sample : samples) {
|
||||
final Iterator<SAMRecordState> iterator = readStates.iterator(sample);
|
||||
final List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
||||
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
|
||||
|
||||
int size = 0; // number of elements in this sample's pileup
|
||||
int nDeletions = 0; // number of deletions in this sample's pileup
|
||||
int nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
|
||||
|
||||
// if extended events are requested, and if previous traversal step brought us over an indel in
|
||||
// at least one read, we emit extended pileup (making sure that it is associated with the previous base,
|
||||
// i.e. the one right *before* the indel) and do NOT shift the current position on the ref.
|
||||
// In this case, the subsequent call to next() will emit the normal pileup at the current base
|
||||
// and shift the position.
|
||||
if (readInfo.generateExtendedEvents() && hasExtendedEvents) {
|
||||
Map<String, ReadBackedExtendedEventPileupImpl> fullExtendedEventPileup = new HashMap<String, ReadBackedExtendedEventPileupImpl>();
|
||||
while (iterator.hasNext()) {
|
||||
final SAMRecordState state = iterator.next(); // state object with the read/offset information
|
||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
|
||||
final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
|
||||
final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
|
||||
final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
|
||||
final int readOffset = state.getReadOffset(); // the base offset on this read
|
||||
|
||||
// get current location on the reference and decrement it by 1: the indels we just stepped over
|
||||
// are associated with the *previous* reference base
|
||||
GenomeLoc loc = genomeLocParser.incPos(getLocation(), -1);
|
||||
final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
|
||||
final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
|
||||
final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
|
||||
final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION;
|
||||
final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
|
||||
|
||||
boolean hasBeenSampled = false;
|
||||
for (final String sample : samples) {
|
||||
Iterator<SAMRecordState> iterator = readStates.iterator(sample);
|
||||
List<ExtendedEventPileupElement> indelPile = new ArrayList<ExtendedEventPileupElement>(readStates.size(sample));
|
||||
hasBeenSampled |= loc.getStart() <= readStates.getDownsamplingExtent(sample);
|
||||
int nextElementLength = nextElement.getLength();
|
||||
|
||||
size = 0;
|
||||
nDeletions = 0;
|
||||
nInsertions = 0;
|
||||
nMQ0Reads = 0;
|
||||
int maxDeletionLength = 0;
|
||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||
continue;
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
final SAMRecordState state = iterator.next();
|
||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||
final int readOffset = state.getReadOffset(); // the base offset on this read
|
||||
final int eventStartOffset = state.getReadEventStartOffset(); // this will be -1 if base is not a deletion, or if base is the first deletion in the event. Otherwise, it will give the last base before the deletion began.
|
||||
final int eventLength = state.getEventLength();
|
||||
|
||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||
continue;
|
||||
|
||||
if (state.hadIndel()) { // this read has an indel associated with the previous position on the ref
|
||||
if (op == CigarOperator.D) {
|
||||
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
|
||||
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
|
||||
size++;
|
||||
ExtendedEventPileupElement pileupElement;
|
||||
if (state.getEventBases() == null) { // Deletion event
|
||||
nDeletions++;
|
||||
maxDeletionLength = Math.max(maxDeletionLength, state.getEventLength());
|
||||
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength);
|
||||
}
|
||||
else { // Insertion event
|
||||
nInsertions++;
|
||||
pileupElement = new ExtendedEventPileupElement(read, eventStartOffset, eventLength, state.getEventBases());
|
||||
}
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
|
||||
indelPile.add(pileupElement);
|
||||
}
|
||||
|
||||
// this read has no indel so add it to the pileup as a NOEVENT:
|
||||
// a deletion that didn't start here (therefore, not an extended event)
|
||||
// we add (mis)matches as no events.
|
||||
else if (op != CigarOperator.D || readInfo.includeReadsWithDeletionAtLoci()) {
|
||||
size++;
|
||||
indelPile.add(new ExtendedEventPileupElement((GATKSAMRecord) state.getRead(), readOffset));
|
||||
nDeletions++;
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (!filterBaseInRead(read, location.getStart())) {
|
||||
String insertedBaseString = null;
|
||||
if (nextOp == CigarOperator.I)
|
||||
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
|
||||
|
||||
if (indelPile.size() != 0)
|
||||
fullExtendedEventPileup.put(sample, new ReadBackedExtendedEventPileupImpl(loc, indelPile, size, maxDeletionLength, nInsertions, nDeletions, nMQ0Reads));
|
||||
}
|
||||
hasExtendedEvents = false; // we are done with extended events prior to current ref base
|
||||
nextAlignmentContext = new AlignmentContext(loc, new ReadBackedExtendedEventPileupImpl(loc, fullExtendedEventPileup), hasBeenSampled);
|
||||
}
|
||||
else { // this is a regular event pileup (not extended)
|
||||
GenomeLoc location = getLocation();
|
||||
Map<String, ReadBackedPileupImpl> fullPileup = new HashMap<String, ReadBackedPileupImpl>();
|
||||
boolean hasBeenSampled = false;
|
||||
for (final String sample : samples) {
|
||||
Iterator<SAMRecordState> iterator = readStates.iterator(sample);
|
||||
List<PileupElement> pile = new ArrayList<PileupElement>(readStates.size(sample));
|
||||
hasBeenSampled |= location.getStart() <= readStates.getDownsamplingExtent(sample);
|
||||
|
||||
size = 0; // number of elements in this sample's pileup
|
||||
nDeletions = 0; // number of deletions in this sample's pileup
|
||||
nMQ0Reads = 0; // number of MQ0 reads in this sample's pileup (warning: current implementation includes N bases that are MQ0)
|
||||
|
||||
while (iterator.hasNext()) {
|
||||
final SAMRecordState state = iterator.next(); // state object with the read/offset information
|
||||
final GATKSAMRecord read = (GATKSAMRecord) state.getRead(); // the actual read
|
||||
final CigarOperator op = state.getCurrentCigarOperator(); // current cigar operator
|
||||
final CigarElement nextElement = state.peekForwardOnGenome(); // next cigar element
|
||||
final CigarElement lastElement = state.peekBackwardOnGenome(); // last cigar element
|
||||
final CigarOperator nextOp = nextElement.getOperator(); // next cigar operator
|
||||
final CigarOperator lastOp = lastElement.getOperator(); // last cigar operator
|
||||
final int readOffset = state.getReadOffset(); // the base offset on this read
|
||||
|
||||
final boolean isBeforeDeletion = nextOp == CigarOperator.DELETION;
|
||||
final boolean isAfterDeletion = lastOp == CigarOperator.DELETION;
|
||||
final boolean isBeforeInsertion = nextOp == CigarOperator.INSERTION;
|
||||
final boolean isAfterInsertion = lastOp == CigarOperator.INSERTION;
|
||||
final boolean isNextToSoftClip = nextOp == CigarOperator.S || (state.getGenomeOffset() == 0 && read.getSoftStart() != read.getAlignmentStart());
|
||||
|
||||
int nextElementLength = nextElement.getLength();
|
||||
|
||||
if (op == CigarOperator.N) // N's are never added to any pileup
|
||||
continue;
|
||||
|
||||
if (op == CigarOperator.D) {
|
||||
if (readInfo.includeReadsWithDeletionAtLoci()) { // only add deletions to the pileup if we are authorized to do so
|
||||
pile.add(new PileupElement(read, readOffset, true, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, null, nextOp == CigarOperator.D ? nextElementLength : -1));
|
||||
size++;
|
||||
nDeletions++;
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (!filterBaseInRead(read, location.getStart())) {
|
||||
String insertedBaseString = null;
|
||||
if (nextOp == CigarOperator.I)
|
||||
insertedBaseString = new String(Arrays.copyOfRange(read.getReadBases(), readOffset + 1, readOffset + 1 + nextElement.getLength()));
|
||||
|
||||
pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
|
||||
size++;
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
}
|
||||
pile.add(new PileupElement(read, readOffset, false, isBeforeDeletion, isAfterDeletion, isBeforeInsertion, isAfterInsertion, isNextToSoftClip, insertedBaseString, nextElementLength));
|
||||
size++;
|
||||
if (read.getMappingQuality() == 0)
|
||||
nMQ0Reads++;
|
||||
}
|
||||
}
|
||||
|
||||
if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
|
||||
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
|
||||
}
|
||||
|
||||
updateReadStates(); // critical - must be called after we get the current state offsets and location
|
||||
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
|
||||
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
|
||||
if (pile.size() != 0) // if this pileup added at least one base, add it to the full pileup
|
||||
fullPileup.put(sample, new ReadBackedPileupImpl(location, pile, size, nDeletions, nMQ0Reads));
|
||||
}
|
||||
|
||||
updateReadStates(); // critical - must be called after we get the current state offsets and location
|
||||
if (!fullPileup.isEmpty()) // if we got reads with non-D/N over the current position, we are done
|
||||
nextAlignmentContext = new AlignmentContext(location, new ReadBackedPileupImpl(location, fullPileup), hasBeenSampled);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -546,9 +376,7 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
while (it.hasNext()) {
|
||||
SAMRecordState state = it.next();
|
||||
CigarOperator op = state.stepForwardOnGenome();
|
||||
if (state.hadIndel() && readInfo.generateExtendedEvents())
|
||||
hasExtendedEvents = true;
|
||||
else if (op == null) {
|
||||
if (op == null) {
|
||||
// we discard the read only when we are past its end AND indel at the end of the read (if any) was
|
||||
// already processed. Keeping the read state that retunred null upon stepForwardOnGenome() is safe
|
||||
// as the next call to stepForwardOnGenome() will return null again AND will clear hadIndel() flag.
|
||||
|
|
@ -757,12 +585,9 @@ public class LocusIteratorByState extends LocusIterator {
|
|||
int readCount = 0;
|
||||
for (SAMRecord read : reads) {
|
||||
if (readCount < maxReads) {
|
||||
SAMRecordState state = new SAMRecordState(read, readInfo.generateExtendedEvents());
|
||||
SAMRecordState state = new SAMRecordState(read);
|
||||
state.stepForwardOnGenome();
|
||||
newReadStates.add(state);
|
||||
// TODO: What if we downsample the extended events away?
|
||||
if (state.hadIndel())
|
||||
hasExtendedEvents = true;
|
||||
readCount++;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -52,6 +52,7 @@ import java.util.ArrayList;
|
|||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
|
||||
|
|
@ -84,6 +85,10 @@ public class GATKRunReport {
|
|||
*/
|
||||
private static File REPORT_SENTINEL = new File(REPORT_DIR.getAbsolutePath() + "/ENABLE");
|
||||
|
||||
// number of milliseconds before the S3 put operation is timed-out:
|
||||
private static final long S3PutTimeOut = 30 * 1000;
|
||||
|
||||
|
||||
/**
|
||||
* our log
|
||||
*/
|
||||
|
|
@ -263,6 +268,58 @@ public class GATKRunReport {
|
|||
}
|
||||
}
|
||||
|
||||
private class S3PutRunnable implements Runnable {
|
||||
|
||||
public AtomicBoolean isSuccess;
|
||||
private final String key;
|
||||
private final byte[] report;
|
||||
|
||||
public S3Object s3Object;
|
||||
public String errorMsg;
|
||||
public Throwable errorThrow;
|
||||
|
||||
public S3PutRunnable(String key, byte[] report){
|
||||
isSuccess = new AtomicBoolean();
|
||||
this.key = key;
|
||||
this.report = report;
|
||||
}
|
||||
|
||||
public void run() {
|
||||
try {
|
||||
// Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials
|
||||
// are stored in an AWSCredentials object:
|
||||
|
||||
// IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket
|
||||
String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user
|
||||
String awsSecretKey = "uQLTduhK6Gy8mbOycpoZIxr8ZoVj1SQaglTWjpYA"; // GATK AWS user
|
||||
AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey);
|
||||
|
||||
// To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP
|
||||
// implementation based on HttpClient, as this is the most robust implementation provided with JetS3t.
|
||||
S3Service s3Service = new RestS3Service(awsCredentials);
|
||||
|
||||
// Create an S3Object based on a file, with Content-Length set automatically and
|
||||
// Content-Type set based on the file's extension (using the Mimetypes utility class)
|
||||
S3Object fileObject = new S3Object(key, report);
|
||||
//logger.info("Created S3Object" + fileObject);
|
||||
//logger.info("Uploading " + localFile + " to AWS bucket");
|
||||
s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
|
||||
isSuccess.set(true);
|
||||
} catch ( S3ServiceException e ) {
|
||||
setException("S3 exception occurred", e);
|
||||
} catch ( NoSuchAlgorithmException e ) {
|
||||
setException("Couldn't calculate MD5", e);
|
||||
} catch ( IOException e ) {
|
||||
setException("Couldn't read report file", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void setException(String msg, Throwable e){
|
||||
errorMsg=msg;
|
||||
errorThrow=e;
|
||||
}
|
||||
}
|
||||
|
||||
private void postReportToAWSS3() {
|
||||
// modifying example code from http://jets3t.s3.amazonaws.com/toolkit/code-samples.html
|
||||
this.hostName = Utils.resolveHostname(); // we want to fill in the host name
|
||||
|
|
@ -280,32 +337,32 @@ public class GATKRunReport {
|
|||
Logger mimeTypeLogger = Logger.getLogger(org.jets3t.service.utils.Mimetypes.class);
|
||||
mimeTypeLogger.setLevel(Level.FATAL);
|
||||
|
||||
// Your Amazon Web Services (AWS) login credentials are required to manage S3 accounts. These credentials
|
||||
// are stored in an AWSCredentials object:
|
||||
// Set the S3 upload on its own thread with timeout:
|
||||
S3PutRunnable s3run = new S3PutRunnable(key,report);
|
||||
Thread s3thread = new Thread(s3run);
|
||||
s3thread.setDaemon(true);
|
||||
s3thread.setName("S3Put-Thread");
|
||||
s3thread.start();
|
||||
|
||||
// IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket
|
||||
String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user
|
||||
String awsSecretKey = "uQLTduhK6Gy8mbOycpoZIxr8ZoVj1SQaglTWjpYA"; // GATK AWS user
|
||||
AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey);
|
||||
s3thread.join(S3PutTimeOut);
|
||||
|
||||
// To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP
|
||||
// implementation based on HttpClient, as this is the most robust implementation provided with JetS3t.
|
||||
S3Service s3Service = new RestS3Service(awsCredentials);
|
||||
|
||||
// Create an S3Object based on a file, with Content-Length set automatically and
|
||||
// Content-Type set based on the file's extension (using the Mimetypes utility class)
|
||||
S3Object fileObject = new S3Object(key, report);
|
||||
//logger.info("Created S3Object" + fileObject);
|
||||
//logger.info("Uploading " + localFile + " to AWS bucket");
|
||||
S3Object s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject);
|
||||
logger.debug("Uploaded to AWS: " + s3Object);
|
||||
logger.info("Uploaded run statistics report to AWS S3");
|
||||
} catch ( S3ServiceException e ) {
|
||||
exceptDuringRunReport("S3 exception occurred", e);
|
||||
} catch ( NoSuchAlgorithmException e ) {
|
||||
exceptDuringRunReport("Couldn't calculate MD5", e);
|
||||
if(s3thread.isAlive()){
|
||||
s3thread.interrupt();
|
||||
exceptDuringRunReport("Run statistics report upload to AWS S3 timed-out");
|
||||
} else if(s3run.isSuccess.get()) {
|
||||
logger.info("Uploaded run statistics report to AWS S3");
|
||||
logger.debug("Uploaded to AWS: " + s3run.s3Object);
|
||||
} else {
|
||||
if((s3run.errorMsg != null) && (s3run.errorThrow != null)){
|
||||
exceptDuringRunReport(s3run.errorMsg,s3run.errorThrow);
|
||||
} else {
|
||||
exceptDuringRunReport("Run statistics report upload to AWS S3 failed");
|
||||
}
|
||||
}
|
||||
} catch ( IOException e ) {
|
||||
exceptDuringRunReport("Couldn't read report file", e);
|
||||
} catch ( InterruptedException e) {
|
||||
exceptDuringRunReport("Run statistics report upload interrupted", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser;
|
|||
* A HACK. Tribble should contain all the information in needs to decode the unqualified position of
|
||||
* a feature.
|
||||
*/
|
||||
public interface ReferenceDependentFeatureCodec<T extends org.broad.tribble.Feature> extends FeatureCodec<T> {
|
||||
public interface ReferenceDependentFeatureCodec {
|
||||
/**
|
||||
* Sets the appropriate GenomeLocParser, providing additional context when decoding larger and more variable features.
|
||||
* @param genomeLocParser The parser to supply.
|
||||
|
|
|
|||
|
|
@ -251,7 +251,7 @@ public class VariantContextAdaptors {
|
|||
|
||||
Map<String, Object> attributes = new HashMap<String, Object>();
|
||||
Collection<Genotype> genotypes = new ArrayList<Genotype>();
|
||||
Genotype call = new Genotype(name, genotypeAlleles);
|
||||
Genotype call = GenotypeBuilder.create(name, genotypeAlleles);
|
||||
|
||||
// add the call to the genotype list, and then use this list to create a VariantContext
|
||||
genotypes.add(call);
|
||||
|
|
@ -344,7 +344,7 @@ public class VariantContextAdaptors {
|
|||
alleles.add(allele2);
|
||||
}
|
||||
|
||||
Genotype g = new Genotype(samples[i], myAlleles);
|
||||
Genotype g = GenotypeBuilder.create(samples[i], myAlleles);
|
||||
genotypes.add(g);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -33,6 +33,9 @@ import org.broadinstitute.sting.gatk.refdata.ReferenceDependentFeatureCodec;
|
|||
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
|
||||
import org.broadinstitute.sting.utils.GenomeLocParser;
|
||||
import org.broadinstitute.sting.utils.classloader.PluginManager;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.AbstractVCFCodec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec;
|
||||
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
|
||||
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
|
||||
import org.broadinstitute.sting.utils.help.GATKDocUtils;
|
||||
|
||||
|
|
@ -82,11 +85,19 @@ public class FeatureManager {
|
|||
|
||||
private final PluginManager<FeatureCodec> pluginManager;
|
||||
private final Collection<FeatureDescriptor> featureDescriptors = new TreeSet<FeatureDescriptor>();
|
||||
private final VCFHeader headerForRepairs;
|
||||
private final boolean lenientVCFProcessing;
|
||||
|
||||
/**
|
||||
* Construct a FeatureManager
|
||||
* Construct a FeatureManager without a master VCF header
|
||||
*/
|
||||
public FeatureManager() {
|
||||
this(null, false);
|
||||
}
|
||||
|
||||
public FeatureManager(final VCFHeader headerForRepairs, final boolean lenientVCFProcessing) {
|
||||
this.headerForRepairs = headerForRepairs;
|
||||
this.lenientVCFProcessing = lenientVCFProcessing;
|
||||
pluginManager = new PluginManager<FeatureCodec>(FeatureCodec.class, "Codecs", "Codec");
|
||||
|
||||
for (final String rawName: pluginManager.getPluginsByName().keySet()) {
|
||||
|
|
@ -244,6 +255,11 @@ public class FeatureManager {
|
|||
((NameAwareCodec)codex).setName(name);
|
||||
if ( codex instanceof ReferenceDependentFeatureCodec )
|
||||
((ReferenceDependentFeatureCodec)codex).setGenomeLocParser(genomeLocParser);
|
||||
if ( codex instanceof VCFCodec )
|
||||
((VCFCodec)codex).setHeaderForRepairs(headerForRepairs);
|
||||
if ( codex instanceof AbstractVCFCodec && lenientVCFProcessing )
|
||||
((AbstractVCFCodec)codex).disableOnTheFlyModifications();
|
||||
|
||||
return codex;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -26,11 +26,10 @@ package org.broadinstitute.sting.gatk.refdata.tracks;
|
|||
import net.sf.samtools.SAMSequenceDictionary;
|
||||
import net.sf.samtools.util.CloseableIterator;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.broad.tribble.AbstractFeatureReader;
|
||||
import org.broad.tribble.CloseableTribbleIterator;
|
||||
import org.broad.tribble.Feature;
|
||||
import org.broad.tribble.FeatureCodec;
|
||||
import org.broad.tribble.FeatureSource;
|
||||
import org.broad.tribble.iterators.CloseableTribbleIterator;
|
||||
import org.broad.tribble.source.PerformanceLoggingFeatureSource;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.FeatureToGATKFeatureIterator;
|
||||
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
|
@ -57,7 +56,7 @@ public class RMDTrack {
|
|||
private final File file; // the associated file we create the reader from
|
||||
|
||||
// our feature reader - allows queries
|
||||
private FeatureSource reader;
|
||||
private AbstractFeatureReader reader;
|
||||
|
||||
// our sequence dictionary, which can be null
|
||||
private final SAMSequenceDictionary dictionary;
|
||||
|
|
@ -92,7 +91,7 @@ public class RMDTrack {
|
|||
* @param dict the sam sequence dictionary
|
||||
* @param codec the feature codec we use to decode this type
|
||||
*/
|
||||
public RMDTrack(Class type, String name, File file, FeatureSource reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) {
|
||||
public RMDTrack(Class type, String name, File file, AbstractFeatureReader reader, SAMSequenceDictionary dict, GenomeLocParser genomeLocParser, FeatureCodec codec) {
|
||||
this.type = type;
|
||||
this.name = name;
|
||||
this.file = file;
|
||||
|
|
@ -116,8 +115,6 @@ public class RMDTrack {
|
|||
|
||||
public CloseableIterator<GATKFeature> query(GenomeLoc interval) throws IOException {
|
||||
CloseableTribbleIterator<Feature> iter = reader.query(interval.getContig(),interval.getStart(),interval.getStop());
|
||||
if ( RMDTrackBuilder.MEASURE_TRIBBLE_QUERY_PERFORMANCE )
|
||||
logger.warn("Query " + getName() + ":" + ((PerformanceLoggingFeatureSource)reader).getPerformanceLog());
|
||||
return new FeatureToGATKFeatureIterator(genomeLocParser, iter, this.getName());
|
||||
}
|
||||
|
||||
|
|
@ -130,10 +127,6 @@ public class RMDTrack {
|
|||
reader = null;
|
||||
}
|
||||
|
||||
public FeatureSource getReader() {
|
||||
return reader;
|
||||
}
|
||||
|
||||
/**
|
||||
* get the sequence dictionary from the track, if available
|
||||
* @return a SAMSequenceDictionary if available, null if unavailable
|
||||
|
|
|
|||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue