Merge remote-tracking branch 'unstable/master'

This commit is contained in:
Eric Banks 2012-10-31 12:17:24 -04:00
commit 0a56fe5bc3
448 changed files with 23355 additions and 8546 deletions

5
.gitignore vendored
View File

@ -18,3 +18,8 @@ queueScatterGather
/bar* /bar*
integrationtests/ integrationtests/
public/testdata/onTheFlyOutputTest.vcf public/testdata/onTheFlyOutputTest.vcf
build/
dist/
dump/
lib/
out/

View File

@ -22,7 +22,9 @@
~ OTHER DEALINGS IN THE SOFTWARE. ~ OTHER DEALINGS IN THE SOFTWARE.
--> -->
<project name="Sting" default="dist" basedir="." xmlns:ivy="antlib:org.apache.ivy.ant"> <project name="Sting" default="dist" basedir="."
xmlns:artifact="antlib:org.apache.maven.artifact.ant"
xmlns:ivy="antlib:org.apache.ivy.ant">
<description>Compile and distribute the Sting toolkit</description> <description>Compile and distribute the Sting toolkit</description>
<!-- ******************************************************************************** --> <!-- ******************************************************************************** -->
@ -250,11 +252,14 @@
<property name="ivy.jar.file" value="ivy-${ivy.install.version}.jar"/> <property name="ivy.jar.file" value="ivy-${ivy.install.version}.jar"/>
<property name="ivy.settings.dir" value="settings"/> <property name="ivy.settings.dir" value="settings"/>
<property file="${ivy.settings.dir}/ivysettings.properties"/> <property file="${ivy.settings.dir}/ivysettings.properties"/>
<property name="maven-ant-tasks.install.version" value="2.1.3"/>
<property name="maven-ant-tasks.jar.file" value="maven-ant-tasks-${maven-ant-tasks.install.version}.jar"/>
<mkdir dir="${lib.dir}"/> <mkdir dir="${lib.dir}"/>
<mkdir dir="${ivy.jar.dir}"/> <mkdir dir="${ivy.jar.dir}"/>
<!-- Comment out the following two lines to build the GATK without a network connection, assuming you have all of the libraries cached already --> <!-- Comment out the following lines to build the GATK without a network connection, assuming you have all of the libraries cached already -->
<get src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/${ivy.jar.file}" <get src="http://repo1.maven.org/maven2/org/apache/ivy/ivy/${ivy.install.version}/${ivy.jar.file}"
dest="${ivy.jar.dir}/${ivy.jar.file}" dest="${ivy.jar.dir}/${ivy.jar.file}"
usetimestamp="true"/> usetimestamp="true"/>
@ -262,6 +267,15 @@
uri="antlib:org.apache.ivy.ant" uri="antlib:org.apache.ivy.ant"
classpath="${ivy.jar.dir}/${ivy.jar.file}"/> classpath="${ivy.jar.dir}/${ivy.jar.file}"/>
<get src="http://repo1.maven.org/maven2/org/apache/maven/maven-ant-tasks/${maven-ant-tasks.install.version}/${maven-ant-tasks.jar.file}"
dest="${ivy.jar.dir}/${maven-ant-tasks.jar.file}"
usetimestamp="true"/>
<taskdef resource="org/apache/maven/artifact/ant/antlib.xml"
uri="antlib:antlib:org.apache.maven.artifact.ant"
classpath="${ivy.jar.dir}/${maven-ant-tasks.jar.file}"/>
<!-- End network lines -->
<ivy:settings file="${ivy.settings.dir}/ivysettings.xml"/> <ivy:settings file="${ivy.settings.dir}/ivysettings.xml"/>
<property name="init.resolve.done" value="true"/> <property name="init.resolve.done" value="true"/>
</target> </target>
@ -295,7 +309,7 @@
<target name="git.rev-parse" depends="git.describe" unless="git.describe.succeeded"> <target name="git.rev-parse" depends="git.describe" unless="git.describe.succeeded">
<exec executable="git" outputproperty="git.rev-parse.output" resultproperty="git.rev-parse.exit.value" failonerror="false"> <exec executable="git" outputproperty="git.rev-parse.output" resultproperty="git.rev-parse.exit.value" failonerror="false">
<arg line="rev-parse HEAD" /> <arg line="rev-parse --short HEAD" />
</exec> </exec>
<condition property="git.rev-parse.succeeded"> <condition property="git.rev-parse.succeeded">
<equals arg1="${git.rev-parse.exit.value}" arg2="0" /> <equals arg1="${git.rev-parse.exit.value}" arg2="0" />
@ -577,6 +591,7 @@
docletpathref="doclet.classpath" docletpathref="doclet.classpath"
classpathref="external.dependencies" classpathref="external.dependencies"
classpath="${java.classes}" classpath="${java.classes}"
maxmemory="2g"
additionalparam="-build-timestamp &quot;${build.timestamp}&quot; -absolute-version ${build.version} -out ${basedir}/${resource.path} -quiet"> additionalparam="-build-timestamp &quot;${build.timestamp}&quot; -absolute-version ${build.version} -out ${basedir}/${resource.path} -quiet">
<sourcefiles> <sourcefiles>
<union> <union>
@ -780,6 +795,7 @@
docletpathref="doclet.classpath" docletpathref="doclet.classpath"
classpathref="external.dependencies" classpathref="external.dependencies"
classpath="${java.classes}" classpath="${java.classes}"
maxmemory="2g"
additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp &quot;${build.timestamp}&quot; -absolute-version ${build.version} -quiet"> <!-- -test to only do DocumentationTest walker --> additionalparam="${gatkdocs.include.hidden.arg} -private -build-timestamp &quot;${build.timestamp}&quot; -absolute-version ${build.version} -quiet"> <!-- -test to only do DocumentationTest walker -->
<sourcefiles> <sourcefiles>
<fileset refid="java.source.files"/> <fileset refid="java.source.files"/>
@ -940,6 +956,28 @@
</ant> </ant>
</target> </target>
<!-- Maven install a package consisting of all supporting files. Don't call this target directly. Call one of the specific packaging targets below -->
<target name="mvninstall" depends="package" description="maven install a package into .m2/repository">
<property name="mvn.build.version" value="0.0.1" />
<!--
We should use the build version or better yet a git tag version, but tags are currently missing. Alternatively how do we then depend on the LATEST?
<property name="mvn.build.version" value="${build.version}" />
-->
<artifact:pom id="${executable}.pom" groupId="org.broadinstitute.sting" artifactId="${executable}" version="${mvn.build.version}" name="${executable}" />
<artifact:writepom pomRefId="${executable}.pom" file="${package.output.dir}/${executable}-${build.version}/${executable}.pom.xml"/>
<artifact:install file="${package.output.dir}/${executable}-${build.version}/${executable}.jar">
<artifact:pom file="${package.output.dir}/${executable}-${build.version}/${executable}.pom.xml" />
</artifact:install>
</target>
<!-- Maven install specific versions of the GATK/Queue. ALWAYS do an ant clean before invoking these! -->
<target name="mvninstall.gatk.full" depends="package.gatk.full,mvninstall" />
<target name="mvninstall.gatk.lite" depends="package.gatk.lite,mvninstall" />
<target name="mvninstall.queue.full" depends="package.queue.full,mvninstall" />
<target name="mvninstall.queue.lite" depends="package.queue.lite,mvninstall" />
<!-- ******************************************************************************** --> <!-- ******************************************************************************** -->
<!-- Clean --> <!-- Clean -->
@ -1177,7 +1215,7 @@
<!-- copy the report to our private_html directory for easy viewing in a broswer --> <!-- copy the report to our private_html directory for easy viewing in a broswer -->
<mkdir dir="${iwww.report.dir}/@{testtype}"/> <mkdir dir="${iwww.report.dir}/@{testtype}"/>
<copy todir="${iwww.report.dir}/@{testtype}" verbose="true"> <copy todir="${iwww.report.dir}/@{testtype}" verbose="false">
<fileset dir="@{outputdir}"/> <fileset dir="@{outputdir}"/>
</copy> </copy>

Binary file not shown.

View File

@ -46,7 +46,8 @@
<dependency org="org.apache.bcel" name="bcel" rev="5.2"/> <dependency org="org.apache.bcel" name="bcel" rev="5.2"/>
<!-- Dependencies for reflections mvn repository --> <!-- Dependencies for reflections mvn repository -->
<dependency org="org.reflections" name="reflections" rev="0.9.5-RC2"/> <dependency org="org.reflections" name="reflections" rev="0.9.8"/>
<dependency org="org.slf4j" name="slf4j-log4j12" rev="1.6.1"/>
<!-- Matrix package from math.nist.gov --> <!-- Matrix package from math.nist.gov -->
<dependency org="gov.nist" name="Jama" rev="1.0.2"/> <dependency org="gov.nist" name="Jama" rev="1.0.2"/>
@ -78,8 +79,8 @@
<dependency org="net.sf.gridscheduler" name="drmaa" rev="latest.integration"/> <dependency org="net.sf.gridscheduler" name="drmaa" rev="latest.integration"/>
<!-- Scala dependancies --> <!-- Scala dependancies -->
<dependency org="org.scala-lang" name="scala-compiler" rev="2.8.1"/> <dependency org="org.scala-lang" name="scala-compiler" rev="2.9.2"/>
<dependency org="org.scala-lang" name="scala-library" rev="2.8.1"/> <dependency org="org.scala-lang" name="scala-library" rev="2.9.2"/>
<!-- testing and evaluation dependencies --> <!-- testing and evaluation dependencies -->
<dependency org="org.testng" name="testng" rev="5.14.1" conf="test"/> <dependency org="org.testng" name="testng" rev="5.14.1" conf="test"/>

View File

@ -0,0 +1,197 @@
/*
* Copyright (c) 2010.
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import net.sf.samtools.SAMReadGroupRecord;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.pileup.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.io.PrintStream;
import java.util.*;
public class AlleleBiasedDownsamplingUtils {
/**
* Computes an allele biased version of the given pileup
*
* @param pileup the original pileup
* @param downsamplingFraction the fraction of total reads to remove per allele
* @param log logging output
* @return allele biased pileup
*/
public static ReadBackedPileup createAlleleBiasedBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
// special case removal of all or no reads
if ( downsamplingFraction <= 0.0 )
return pileup;
if ( downsamplingFraction >= 1.0 )
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>());
final ArrayList<PileupElement>[] alleleStratifiedElements = new ArrayList[4];
for ( int i = 0; i < 4; i++ )
alleleStratifiedElements[i] = new ArrayList<PileupElement>();
// start by stratifying the reads by the alleles they represent at this position
for( final PileupElement pe : pileup ) {
// abort if we have a reduced read - we do not want to remove it!
if ( pe.getRead().isReducedRead() )
return pileup;
final int baseIndex = BaseUtils.simpleBaseToBaseIndex(pe.getBase());
if ( baseIndex != -1 )
alleleStratifiedElements[baseIndex].add(pe);
}
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
// Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later.
int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor
final TreeSet<PileupElement> elementsToKeep = new TreeSet<PileupElement>(new Comparator<PileupElement>() {
@Override
public int compare(PileupElement element1, PileupElement element2) {
final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart();
return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName());
}
});
for ( int i = 0; i < 4; i++ ) {
final ArrayList<PileupElement> alleleList = alleleStratifiedElements[i];
if ( alleleList.size() <= numReadsToRemove )
logAllElements(alleleList, log);
else
elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log));
}
// clean up pointers so memory can be garbage collected if needed
for ( int i = 0; i < 4; i++ )
alleleStratifiedElements[i].clear();
return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList<PileupElement>(elementsToKeep));
}
/**
* Performs allele biased down-sampling on a pileup and computes the list of elements to keep
*
* @param elements original list of records
* @param numElementsToRemove the number of records to remove
* @param log logging output
* @return the list of pileup elements TO KEEP
*/
private static List<PileupElement> downsampleElements(final ArrayList<PileupElement> elements, final int numElementsToRemove, final PrintStream log) {
final int pileupSize = elements.size();
final BitSet itemsToRemove = new BitSet(pileupSize);
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
}
ArrayList<PileupElement> elementsToKeep = new ArrayList<PileupElement>(pileupSize - numElementsToRemove);
for ( int i = 0; i < pileupSize; i++ ) {
if ( itemsToRemove.get(i) )
logRead(elements.get(i).getRead(), log);
else
elementsToKeep.add(elements.get(i));
}
return elementsToKeep;
}
/**
* Computes reads to remove based on an allele biased down-sampling
*
* @param alleleReadMap original list of records per allele
* @param downsamplingFraction the fraction of total reads to remove per allele
* @param log logging output
* @return list of reads TO REMOVE from allele biased down-sampling
*/
public static List<GATKSAMRecord> selectAlleleBiasedReads(final Map<Allele, List<GATKSAMRecord>> alleleReadMap, final double downsamplingFraction, final PrintStream log) {
int totalReads = 0;
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() )
totalReads += reads.size();
// Down-sample *each* allele by the contamination fraction applied to the entire pileup.
int numReadsToRemove = (int)(totalReads * downsamplingFraction);
final List<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(numReadsToRemove * alleleReadMap.size());
for ( final List<GATKSAMRecord> reads : alleleReadMap.values() ) {
if ( reads.size() <= numReadsToRemove ) {
readsToRemove.addAll(reads);
logAllReads(reads, log);
} else {
readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log));
}
}
return readsToRemove;
}
/**
* Performs allele biased down-sampling on a pileup and computes the list of elements to remove
*
* @param reads original list of records
* @param numElementsToRemove the number of records to remove
* @param log logging output
* @return the list of pileup elements TO REMOVE
*/
private static List<GATKSAMRecord> downsampleReads(final List<GATKSAMRecord> reads, final int numElementsToRemove, final PrintStream log) {
final int pileupSize = reads.size();
final BitSet itemsToRemove = new BitSet(pileupSize);
for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) {
itemsToRemove.set(selectedIndex);
}
ArrayList<GATKSAMRecord> readsToRemove = new ArrayList<GATKSAMRecord>(pileupSize - numElementsToRemove);
for ( int i = 0; i < pileupSize; i++ ) {
if ( itemsToRemove.get(i) ) {
final GATKSAMRecord read = reads.get(i);
readsToRemove.add(read);
logRead(read, log);
}
}
return readsToRemove;
}
private static void logAllElements(final List<PileupElement> elements, final PrintStream log) {
if ( log != null ) {
for ( final PileupElement p : elements )
logRead(p.getRead(), log);
}
}
private static void logAllReads(final List<GATKSAMRecord> reads, final PrintStream log) {
if ( log != null ) {
for ( final GATKSAMRecord read : reads )
logRead(read, log);
}
}
private static void logRead(final SAMRecord read, final PrintStream log) {
if ( log != null ) {
final SAMReadGroupRecord readGroup = read.getReadGroup();
log.println(String.format("%s\t%s\t%s\t%s", read.getReadName(), readGroup.getSample(), readGroup.getLibrary(), readGroup.getPlatformUnit()));
}
}
}

View File

@ -28,75 +28,56 @@ package org.broadinstitute.sting.gatk.walkers.bqsr;
import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource;
import org.broadinstitute.sting.utils.collections.NestedIntegerArray;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.utils.recalibration.ReadCovariates;
import org.broadinstitute.sting.utils.recalibration.RecalDatum;
import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.threading.ThreadLocalArray;
public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource {
// optimizations: don't reallocate an array each time // optimization: only allocate temp arrays once per thread
private byte[] tempQualArray; private final ThreadLocal<byte[]> threadLocalTempQualArray = new ThreadLocalArray<byte[]>(EventType.values().length, byte.class);
private boolean[] tempErrorArray; private final ThreadLocal<double[]> threadLocalTempFractionalErrorArray = new ThreadLocalArray<double[]>(EventType.values().length, double.class);
public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) {
super.initialize(covariates, recalibrationTables); super.initialize(covariates, recalibrationTables);
tempQualArray = new byte[EventType.values().length];
tempErrorArray = new boolean[EventType.values().length];
} }
/** @Override
* Loop through the list of requested covariates and pick out the value from the read, offset, and reference public void updateDataForRead(final GATKSAMRecord read, final boolean[] skip, final double[] snpErrors, final double[] insertionErrors, final double[] deletionErrors ) {
* Using the list of covariate values as a key, pick out the RecalDatum and increment, for( int offset = 0; offset < read.getReadBases().length; offset++ ) {
* adding one to the number of observations and potentially one to the number of mismatches for all three if( !skip[offset] ) {
* categories (mismatches, insertions and deletions). final ReadCovariates readCovariates = covariateKeySetFrom(read);
*
* @param pileupElement The pileup element to update
* @param refBase The reference base at this locus
*/
public synchronized void updateDataForPileupElement(final PileupElement pileupElement, final byte refBase) {
final int offset = pileupElement.getOffset();
final ReadCovariates readCovariates = covariateKeySetFrom(pileupElement.getRead());
tempQualArray[EventType.BASE_SUBSTITUTION.index] = pileupElement.getQual(); byte[] tempQualArray = threadLocalTempQualArray.get();
tempErrorArray[EventType.BASE_SUBSTITUTION.index] = !BaseUtils.basesAreEqual(pileupElement.getBase(), refBase); double[] tempFractionalErrorArray = threadLocalTempFractionalErrorArray.get();
tempQualArray[EventType.BASE_INSERTION.index] = pileupElement.getBaseInsertionQual();
tempErrorArray[EventType.BASE_INSERTION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? pileupElement.isAfterInsertion() : pileupElement.isBeforeInsertion();
tempQualArray[EventType.BASE_DELETION.index] = pileupElement.getBaseDeletionQual();
tempErrorArray[EventType.BASE_DELETION.index] = (pileupElement.getRead().getReadNegativeStrandFlag()) ? pileupElement.isAfterDeletedBase() : pileupElement.isBeforeDeletedBase();
for (final EventType eventType : EventType.values()) { tempQualArray[EventType.BASE_SUBSTITUTION.index] = read.getBaseQualities()[offset];
final int[] keys = readCovariates.getKeySet(offset, eventType); tempFractionalErrorArray[EventType.BASE_SUBSTITUTION.index] = snpErrors[offset];
final int eventIndex = eventType.index; tempQualArray[EventType.BASE_INSERTION.index] = read.getBaseInsertionQualities()[offset];
final byte qual = tempQualArray[eventIndex]; tempFractionalErrorArray[EventType.BASE_INSERTION.index] = insertionErrors[offset];
final boolean isError = tempErrorArray[eventIndex]; tempQualArray[EventType.BASE_DELETION.index] = read.getBaseDeletionQualities()[offset];
tempFractionalErrorArray[EventType.BASE_DELETION.index] = deletionErrors[offset];
final NestedIntegerArray<RecalDatum> rgRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE); for (final EventType eventType : EventType.values()) {
final RecalDatum rgPreviousDatum = rgRecalTable.get(keys[0], eventIndex); final int[] keys = readCovariates.getKeySet(offset, eventType);
final RecalDatum rgThisDatum = createDatumObject(qual, isError); final int eventIndex = eventType.index;
if (rgPreviousDatum == null) // key doesn't exist yet in the map so make a new bucket and add it final byte qual = tempQualArray[eventIndex];
rgRecalTable.put(rgThisDatum, keys[0], eventIndex); final double isError = tempFractionalErrorArray[eventIndex];
else
rgPreviousDatum.combine(rgThisDatum);
final NestedIntegerArray<RecalDatum> qualRecalTable = recalibrationTables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE); combineDatumOrPutIfNecessary(recalibrationTables.getReadGroupTable(), qual, isError, keys[0], eventIndex);
final RecalDatum qualPreviousDatum = qualRecalTable.get(keys[0], keys[1], eventIndex);
if (qualPreviousDatum == null)
qualRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], eventIndex);
else
qualPreviousDatum.increment(isError);
for (int i = 2; i < covariates.length; i++) { incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventIndex);
if (keys[i] < 0)
continue; for (int i = 2; i < covariates.length; i++) {
final NestedIntegerArray<RecalDatum> covRecalTable = recalibrationTables.getTable(i); if (keys[i] < 0)
final RecalDatum covPreviousDatum = covRecalTable.get(keys[0], keys[1], keys[i], eventIndex); continue;
if (covPreviousDatum == null)
covRecalTable.put(createDatumObject(qual, isError), keys[0], keys[1], keys[i], eventIndex); incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex);
else }
covPreviousDatum.increment(isError); }
} }
} }
} }

View File

@ -1,8 +1,5 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads; package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import java.util.HashMap;
import java.util.Map;
/** /**
* An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base. * An object that keeps track of the base counts as well as the sum of the base, insertion and deletion qualities of each base.
* *
@ -10,47 +7,42 @@ import java.util.Map;
* @since 6/15/12 * @since 6/15/12
*/ */
public class BaseAndQualsCounts extends BaseCounts { public class BaseAndQualsCounts extends BaseCounts {
private final Map<BaseIndex, Long> sumInsertionQuals; private final long[] sumInsertionQuals;
private final Map<BaseIndex, Long> sumDeletionQuals; private final long[] sumDeletionQuals;
public BaseAndQualsCounts() { public BaseAndQualsCounts() {
super(); super();
this.sumInsertionQuals = new HashMap<BaseIndex, Long>(); this.sumInsertionQuals = new long[BaseIndex.values().length];
this.sumDeletionQuals = new HashMap<BaseIndex, Long>(); this.sumDeletionQuals = new long[BaseIndex.values().length];
for (BaseIndex i : BaseIndex.values()) { for (final BaseIndex i : BaseIndex.values()) {
sumInsertionQuals.put(i, 0L); sumInsertionQuals[i.index] = 0L;
sumDeletionQuals.put(i, 0L); sumDeletionQuals[i.index] = 0L;
} }
} }
public void incr(byte base, byte baseQual, byte insQual, byte delQual) { public void incr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
super.incr(base, baseQual); final BaseIndex i = BaseIndex.byteToBase(base);
BaseIndex i = BaseIndex.byteToBase(base); super.incr(i, baseQual);
if (i != null) { // do not allow Ns sumInsertionQuals[i.index] += insQual;
sumInsertionQuals.put(i, sumInsertionQuals.get(i) + insQual); sumDeletionQuals[i.index] += delQual;
sumDeletionQuals.put(i, sumDeletionQuals.get(i) + delQual);
}
} }
public void decr(byte base, byte baseQual, byte insQual, byte delQual) { public void decr(final byte base, final byte baseQual, final byte insQual, final byte delQual) {
super.decr(base, baseQual); final BaseIndex i = BaseIndex.byteToBase(base);
BaseIndex i = BaseIndex.byteToBase(base); super.decr(i, baseQual);
if (i != null) { // do not allow Ns sumInsertionQuals[i.index] -= insQual;
sumInsertionQuals.put(i, sumInsertionQuals.get(i) - insQual); sumDeletionQuals[i.index] -= delQual;
sumDeletionQuals.put(i, sumDeletionQuals.get(i) - delQual);
}
} }
public byte averageInsertionQualsOfMostCommonBase() { public byte averageInsertionQualsOfBase(final BaseIndex base) {
return getGenericAverageQualOfMostCommonBase(sumInsertionQuals); return getGenericAverageQualOfBase(base, sumInsertionQuals);
} }
public byte averageDeletionQualsOfMostCommonBase() { public byte averageDeletionQualsOfBase(final BaseIndex base) {
return getGenericAverageQualOfMostCommonBase(sumDeletionQuals); return getGenericAverageQualOfBase(base, sumDeletionQuals);
} }
private byte getGenericAverageQualOfMostCommonBase(Map<BaseIndex, Long> sumQuals) { private byte getGenericAverageQualOfBase(final BaseIndex base, final long[] sumQuals) {
BaseIndex base = BaseIndex.byteToBase(baseWithMostCounts()); return (byte) (sumQuals[base.index] / countOfBase(base));
return (byte) (sumQuals.get(base) / getCount(base));
} }
} }

View File

@ -3,11 +3,9 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import com.google.java.contract.Ensures; import com.google.java.contract.Ensures;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import java.util.EnumMap;
import java.util.Map;
/** /**
* An object to keep track of the number of occurences of each base and it's quality. * An object to keep track of the number of occurrences of each base and it's quality.
* *
* User: depristo * User: depristo
* Date: 4/8/11 * Date: 4/8/11
@ -18,206 +16,225 @@ import java.util.Map;
public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N; public final static BaseIndex MAX_BASE_INDEX_WITH_NO_COUNTS = BaseIndex.N;
public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte(); public final static byte MAX_BASE_WITH_NO_COUNTS = MAX_BASE_INDEX_WITH_NO_COUNTS.getByte();
private final Map<BaseIndex, Integer> counts; // keeps track of the base counts private final int[] counts; // keeps track of the base counts
private final Map<BaseIndex, Long> sumQuals; // keeps track of the quals of each base private final long[] sumQuals; // keeps track of the quals of each base
private int totalCount = 0; // keeps track of total count since this is requested so often
public BaseCounts() { public BaseCounts() {
counts = new EnumMap<BaseIndex, Integer>(BaseIndex.class); counts = new int[BaseIndex.values().length];
sumQuals = new EnumMap<BaseIndex, Long>(BaseIndex.class); sumQuals = new long[BaseIndex.values().length];
for (BaseIndex i : BaseIndex.values()) { for (final BaseIndex i : BaseIndex.values()) {
counts.put(i, 0); counts[i.index] = 0;
sumQuals.put(i, 0L); sumQuals[i.index] = 0L;
} }
} }
public static BaseCounts createWithCounts(int[] countsACGT) { public static BaseCounts createWithCounts(int[] countsACGT) {
BaseCounts baseCounts = new BaseCounts(); BaseCounts baseCounts = new BaseCounts();
baseCounts.counts.put(BaseIndex.A, countsACGT[0]); baseCounts.counts[BaseIndex.A.index] = countsACGT[0];
baseCounts.counts.put(BaseIndex.C, countsACGT[1]); baseCounts.counts[BaseIndex.C.index] = countsACGT[1];
baseCounts.counts.put(BaseIndex.G, countsACGT[2]); baseCounts.counts[BaseIndex.G.index] = countsACGT[2];
baseCounts.counts.put(BaseIndex.T, countsACGT[3]); baseCounts.counts[BaseIndex.T.index] = countsACGT[3];
baseCounts.totalCount = countsACGT[0] + countsACGT[1] + countsACGT[2] + countsACGT[3];
return baseCounts; return baseCounts;
} }
@Requires("other != null") @Requires("other != null")
public void add(BaseCounts other) { public void add(final BaseCounts other) {
for (BaseIndex i : BaseIndex.values()) for (final BaseIndex i : BaseIndex.values()) {
counts.put(i, counts.get(i) + other.counts.get(i)); final int otherCount = other.counts[i.index];
counts[i.index] += otherCount;
totalCount += otherCount;
}
} }
@Requires("other != null") @Requires("other != null")
public void sub(BaseCounts other) { public void sub(final BaseCounts other) {
for (BaseIndex i : BaseIndex.values()) for (final BaseIndex i : BaseIndex.values()) {
counts.put(i, counts.get(i) - other.counts.get(i)); final int otherCount = other.counts[i.index];
} counts[i.index] -= otherCount;
totalCount -= otherCount;
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(byte base) {
BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) // no Ns
counts.put(i, counts.get(i) + 1);
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(byte base, byte qual) {
BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // no Ns
counts.put(i, counts.get(i) + 1);
sumQuals.put(i, sumQuals.get(i) + qual);
} }
} }
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void decr(byte base) { public void incr(final byte base) {
BaseIndex i = BaseIndex.byteToBase(base); final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) // no Ns counts[i.index]++;
counts.put(i, counts.get(i) - 1); totalCount++;
}
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) + 1")
public void incr(final BaseIndex base, final byte qual) {
counts[base.index]++;
totalCount++;
sumQuals[base.index] += qual;
} }
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1") @Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
public void decr(byte base, byte qual) { public void decr(final byte base) {
BaseIndex i = BaseIndex.byteToBase(base); final BaseIndex i = BaseIndex.byteToBase(base);
if (i != null) { // no Ns counts[i.index]--;
counts.put(i, counts.get(i) - 1); totalCount--;
sumQuals.put(i, sumQuals.get(i) - qual);
}
} }
@Ensures("totalCount() == old(totalCount()) || totalCount() == old(totalCount()) - 1")
public void decr(final BaseIndex base, final byte qual) {
@Ensures("result >= 0") counts[base.index]--;
public int getCount(byte base) { totalCount--;
return getCount(BaseIndex.byteToBase(base)); sumQuals[base.index] -= qual;
} }
@Ensures("result >= 0") @Ensures("result >= 0")
public int getCount(BaseIndex base) { public long getSumQuals(final byte base) {
return counts.get(base);
}
@Ensures("result >= 0")
public long getSumQuals(byte base) {
return getSumQuals(BaseIndex.byteToBase(base)); return getSumQuals(BaseIndex.byteToBase(base));
} }
@Ensures("result >= 0") @Ensures("result >= 0")
public long getSumQuals(BaseIndex base) { public long getSumQuals(final BaseIndex base) {
return sumQuals.get(base); return sumQuals[base.index];
} }
@Ensures("result >= 0") @Ensures("result >= 0")
public byte averageQuals(byte base) { public byte averageQuals(final byte base) {
return (byte) (getSumQuals(base) / getCount(base)); return (byte) (getSumQuals(base) / countOfBase(base));
} }
@Ensures("result >= 0") @Ensures("result >= 0")
public byte averageQuals(BaseIndex base) { public byte averageQuals(final BaseIndex base) {
return (byte) (getSumQuals(base) / getCount(base)); return (byte) (getSumQuals(base) / countOfBase(base));
}
@Ensures("result >= 0")
public int countOfBase(final byte base) {
return countOfBase(BaseIndex.byteToBase(base));
}
@Ensures("result >= 0")
public int countOfBase(final BaseIndex base) {
return counts[base.index];
}
@Ensures("result >= 0")
public long sumQualsOfBase(final BaseIndex base) {
return sumQuals[base.index];
}
@Ensures("result >= 0")
public byte averageQualsOfBase(final BaseIndex base) {
return (byte) (sumQualsOfBase(base) / countOfBase(base));
}
@Ensures("result >= 0")
public int totalCount() {
return totalCount;
}
/**
* Given a base , it returns the proportional count of this base compared to all other bases
*
* @param base base
* @return the proportion of this base over all other bases
*/
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportion(final byte base) {
return baseCountProportion(BaseIndex.byteToBase(base));
}
/**
* Given a base , it returns the proportional count of this base compared to all other bases
*
* @param baseIndex base
* @return the proportion of this base over all other bases
*/
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportion(final BaseIndex baseIndex) {
return (totalCount == 0) ? 0.0 : (double)counts[baseIndex.index] / (double)totalCount;
}
@Ensures("result != null")
public String toString() {
StringBuilder b = new StringBuilder();
for (final BaseIndex i : BaseIndex.values()) {
b.append(i.toString()).append("=").append(counts[i.index]).append(",");
}
return b.toString();
} }
public byte baseWithMostCounts() { public byte baseWithMostCounts() {
return baseIndexWithMostCounts().getByte(); return baseIndexWithMostCounts().getByte();
} }
@Ensures("result >= 0")
public int countOfMostCommonBase() {
return counts.get(baseIndexWithMostCounts());
}
@Ensures("result >= 0")
public long sumQualsOfMostCommonBase() {
return sumQuals.get(baseIndexWithMostCounts());
}
@Ensures("result >= 0")
public byte averageQualsOfMostCommonBase() {
return (byte) (sumQualsOfMostCommonBase() / countOfMostCommonBase());
}
@Ensures("result >= 0")
public int totalCount() {
int sum = 0;
for (int c : counts.values())
sum += c;
return sum;
}
/**
* Given a base , it returns the proportional count of this base compared to all other bases
*
* @param base
* @return the proportion of this base over all other bases
*/
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportion(byte base) {
return (double) counts.get(BaseIndex.byteToBase(base)) / totalCount();
}
/**
* Given a base , it returns the proportional count of this base compared to all other bases
*
* @param baseIndex
* @return the proportion of this base over all other bases
*/
@Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportion(BaseIndex baseIndex) {
int total = totalCount();
if (total == 0)
return 0.0;
return (double) counts.get(baseIndex) / totalCount();
}
@Ensures("result != null")
public String toString() {
StringBuilder b = new StringBuilder();
for (Map.Entry<BaseIndex, Integer> elt : counts.entrySet()) {
b.append(elt.toString()).append("=").append(elt.getValue()).append(",");
}
return b.toString();
}
@Ensures("result != null") @Ensures("result != null")
public BaseIndex baseIndexWithMostCounts() { public BaseIndex baseIndexWithMostCounts() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS; BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (BaseIndex i : counts.keySet()) for (final BaseIndex i : BaseIndex.values()) {
if (counts.get(i) > counts.get(maxI)) if (counts[i.index] > counts[maxI.index])
maxI = i; maxI = i;
}
return maxI; return maxI;
} }
@Ensures("result != null") @Ensures("result != null")
public BaseIndex baseIndexWithMostCountsWithoutIndels() { public BaseIndex baseIndexWithMostCountsWithoutIndels() {
BaseIndex mostCounts = MAX_BASE_INDEX_WITH_NO_COUNTS; BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (BaseIndex index : counts.keySet()) for (final BaseIndex i : BaseIndex.values()) {
if (index.isNucleotide() && counts.get(index) > counts.get(mostCounts)) if (i.isNucleotide() && counts[i.index] > counts[maxI.index])
mostCounts = index; maxI = i;
return mostCounts; }
return maxI;
}
private boolean hasHigherCount(final BaseIndex targetIndex, final BaseIndex testIndex) {
final int targetCount = counts[targetIndex.index];
final int testCount = counts[testIndex.index];
return ( targetCount > testCount || (targetCount == testCount && sumQuals[targetIndex.index] > sumQuals[testIndex.index]) );
}
public byte baseWithMostProbability() {
return baseIndexWithMostProbability().getByte();
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostProbability() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (final BaseIndex i : BaseIndex.values()) {
if (sumQuals[i.index] > sumQuals[maxI.index])
maxI = i;
}
return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCounts());
}
@Ensures("result != null")
public BaseIndex baseIndexWithMostProbabilityWithoutIndels() {
BaseIndex maxI = MAX_BASE_INDEX_WITH_NO_COUNTS;
for (final BaseIndex i : BaseIndex.values()) {
if (i.isNucleotide() && sumQuals[i.index] > sumQuals[maxI.index])
maxI = i;
}
return (sumQuals[maxI.index] > 0L ? maxI : baseIndexWithMostCountsWithoutIndels());
} }
@Ensures("result >=0") @Ensures("result >=0")
public int totalCountWithoutIndels() { public int totalCountWithoutIndels() {
int sum = 0; return totalCount - counts[BaseIndex.D.index] - counts[BaseIndex.I.index];
for (BaseIndex index : counts.keySet())
if (index.isNucleotide())
sum += counts.get(index);
return sum;
} }
/** /**
* Calculates the proportional count of a base compared to all other bases except indels (I and D) * Calculates the proportional count of a base compared to all other bases except indels (I and D)
* *
* @param index * @param base base
* @return the proportion of this base over all other bases except indels * @return the proportion of this base over all other bases except indels
*/ */
@Requires("index.isNucleotide()") @Requires("base.isNucleotide()")
@Ensures({"result >=0.0", "result<= 1.0"}) @Ensures({"result >=0.0", "result<= 1.0"})
public double baseCountProportionWithoutIndels(BaseIndex index) { public double baseCountProportionWithoutIndels(final BaseIndex base) {
int total = totalCountWithoutIndels(); final int total = totalCountWithoutIndels();
if (total == 0) return (total == 0) ? 0.0 : (double)counts[base.index] / (double)total;
return 0.0; }
return (double) counts.get(index) / totalCountWithoutIndels();
public int[] countsArray() {
return counts.clone();
} }
} }

View File

@ -1,5 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads; package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
/** /**
* Simple byte / base index conversions * Simple byte / base index conversions
* *
@ -56,7 +58,7 @@ public enum BaseIndex {
case 'N': case 'N':
case 'n': case 'n':
return N; return N;
default: return null; default: throw new ReviewedStingException("Tried to create a byte index for an impossible base " + base);
} }
} }
@ -68,7 +70,7 @@ public enum BaseIndex {
* @return whether or not it is a nucleotide, given the definition above * @return whether or not it is a nucleotide, given the definition above
*/ */
public boolean isNucleotide() { public boolean isNucleotide() {
return this == A || this == C || this == G || this == T || this == N; return !isIndel();
} }
/** /**

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Arrays;
import java.util.LinkedList; import java.util.LinkedList;
/** /**
@ -156,11 +157,9 @@ public class HeaderElement {
* @return whether or not the HeaderElement is variant due to excess insertions * @return whether or not the HeaderElement is variant due to excess insertions
*/ */
private boolean isVariantFromInsertions(double minIndelProportion) { private boolean isVariantFromInsertions(double minIndelProportion) {
int numberOfBases = consensusBaseCounts.totalCount(); final int numberOfBases = consensusBaseCounts.totalCount();
if (numberOfBases == 0 && insertionsToTheRight > 0) if (numberOfBases == 0)
return true; // we only have insertions return (insertionsToTheRight > 0); // do we only have insertions?
else if (numberOfBases == 0)
return false; // we don't have anything
// if we have bases and insertions, check the ratio // if we have bases and insertions, check the ratio
return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion; return ((double) insertionsToTheRight / numberOfBases) > minIndelProportion;
@ -181,7 +180,7 @@ public class HeaderElement {
* @return whether or not the HeaderElement is variant due to excess insertions * @return whether or not the HeaderElement is variant due to excess insertions
*/ */
private boolean isVariantFromMismatches(double minVariantProportion) { private boolean isVariantFromMismatches(double minVariantProportion) {
BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostCountsWithoutIndels(); BaseIndex mostCommon = consensusBaseCounts.baseIndexWithMostProbabilityWithoutIndels();
double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon); double mostCommonProportion = consensusBaseCounts.baseCountProportionWithoutIndels(mostCommon);
return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion); return mostCommonProportion != 0.0 && mostCommonProportion < (1 - minVariantProportion);
} }
@ -200,5 +199,28 @@ public class HeaderElement {
return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual; return baseQual >= minBaseQual && baseMappingQuality >= minMappingQual;
} }
/**
* Calculates the number of haplotypes necessary to represent this site.
*
* @param minVariantProportion the minimum proportion to call a site variant.
* @return the number of haplotypes necessary to represent this site.
*/
public int getNumberOfHaplotypes(double minVariantProportion) {
int nHaplotypes = 0;
int totalCount = consensusBaseCounts.totalCount();
int runningCount = 0;
if (totalCount == 0)
return 0;
int[] countsArray = consensusBaseCounts.countsArray();
Arrays.sort(countsArray);
for (int i = countsArray.length-1; i>=0; i--) {
nHaplotypes++;
runningCount += countsArray[i];
if (runningCount/totalCount > minVariantProportion)
break;
}
return nHaplotypes;
}
} }

View File

@ -53,11 +53,13 @@ public class MultiSampleCompressor implements Compressor {
final double minAltProportionToTriggerVariant, final double minAltProportionToTriggerVariant,
final double minIndelProportionToTriggerVariant, final double minIndelProportionToTriggerVariant,
final int minBaseQual, final int minBaseQual,
final ReduceReads.DownsampleStrategy downsampleStrategy) { final ReduceReads.DownsampleStrategy downsampleStrategy,
final int nContigs,
final boolean allowPolyploidReduction) {
for ( String name : SampleUtils.getSAMFileSamples(header) ) { for ( String name : SampleUtils.getSAMFileSamples(header) ) {
compressorsPerSample.put(name, compressorsPerSample.put(name,
new SingleSampleCompressor(name, contextSize, downsampleCoverage, new SingleSampleCompressor(contextSize, downsampleCoverage,
minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, allowPolyploidReduction));
} }
} }

View File

@ -34,7 +34,7 @@ import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.*; import org.broadinstitute.sting.gatk.filters.*;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.PartitionBy; import org.broadinstitute.sting.gatk.walkers.PartitionBy;
import org.broadinstitute.sting.gatk.walkers.PartitionType; import org.broadinstitute.sting.gatk.walkers.PartitionType;
import org.broadinstitute.sting.gatk.walkers.ReadFilters; import org.broadinstitute.sting.gatk.walkers.ReadFilters;
@ -52,23 +52,23 @@ import java.util.*;
/** /**
* Reduces the BAM file using read based compression that keeps only essential information for variant calling * Reduces the BAM file using read based compression that keeps only essential information for variant calling
* <p/> *
* <p> * <p>
* This walker will generated reduced versions of the BAM files that still follow the BAM spec * This walker will generated reduced versions of the BAM files that still follow the BAM spec
* and contain all the information necessary for the GSA variant calling pipeline. Some options * and contain all the information necessary for the GSA variant calling pipeline. Some options
* allow you to tune in how much compression you want to achieve. The default values have been * allow you to tune in how much compression you want to achieve. The default values have been
* shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the * shown to reduce a typical whole exome BAM file 100x. The higher the coverage, the bigger the
* savings in file size and performance of the downstream tools. * savings in file size and performance of the downstream tools.
* <p/> *
* <h2>Input</h2> * <h2>Input</h2>
* <p> * <p>
* The BAM file to be compressed * The BAM file to be compressed
* </p> * </p>
* <p/> *
* <h2>Output</h2> * <h2>Output</h2>
* <p> * <p>
* The compressed (reduced) BAM file. * The compressed (reduced) BAM file.
* </p> *
* <p/> * <p/>
* <h2>Examples</h2> * <h2>Examples</h2>
* <pre> * <pre>
@ -86,13 +86,13 @@ import java.util.*;
public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> { public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceReadsStash> {
@Output @Output
protected StingSAMFileWriter out; private StingSAMFileWriter out;
/** /**
* The number of bases to keep around mismatches (potential variation) * The number of bases to keep around mismatches (potential variation)
*/ */
@Argument(fullName = "context_size", shortName = "cs", doc = "", required = false) @Argument(fullName = "context_size", shortName = "cs", doc = "", required = false)
protected int contextSize = 10; private int contextSize = 10;
/** /**
* The minimum mapping quality to be considered for the consensus synthetic read. Reads that have * The minimum mapping quality to be considered for the consensus synthetic read. Reads that have
@ -100,7 +100,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
* towards variable regions. * towards variable regions.
*/ */
@Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false) @Argument(fullName = "minimum_mapping_quality", shortName = "minmap", doc = "", required = false)
protected int minMappingQuality = 20; private int minMappingQuality = 20;
/** /**
* The minimum base quality to be considered for the consensus synthetic read. Reads that have * The minimum base quality to be considered for the consensus synthetic read. Reads that have
@ -108,35 +108,41 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
* towards variable regions. * towards variable regions.
*/ */
@Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false) @Argument(fullName = "minimum_base_quality_to_consider", shortName = "minqual", doc = "", required = false)
protected byte minBaseQual = 20; private byte minBaseQual = 20;
/** /**
* Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality * Reads have notoriously low quality bases on the tails (left and right). Consecutive bases with quality
* lower than this threshold will be hard clipped off before entering the reduce reads algorithm. * lower than this threshold will be hard clipped off before entering the reduce reads algorithm.
*/ */
@Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false) @Argument(fullName = "minimum_tail_qualities", shortName = "mintail", doc = "", required = false)
protected byte minTailQuality = 2; private byte minTailQuality = 2;
/**
* Allow the experimental polyploid-based reduction capabilities of this tool
*/
@Argument(fullName = "allow_polyploid_reduction", shortName = "polyploid", doc = "", required = false)
private boolean USE_POLYPLOID_REDUCTION = false;
/** /**
* Do not simplify read (strip away all extra information of the read -- anything other than bases, quals * Do not simplify read (strip away all extra information of the read -- anything other than bases, quals
* and read group). * and read group).
*/ */
@Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false) @Argument(fullName = "dont_simplify_reads", shortName = "nosimplify", doc = "", required = false)
protected boolean DONT_SIMPLIFY_READS = false; private boolean DONT_SIMPLIFY_READS = false;
/** /**
* Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired. * Do not hard clip adaptor sequences. Note: You don't have to turn this on for reads that are not mate paired.
* The program will behave correctly in those cases. * The program will behave correctly in those cases.
*/ */
@Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false) @Argument(fullName = "dont_hardclip_adaptor_sequences", shortName = "noclip_ad", doc = "", required = false)
protected boolean DONT_CLIP_ADAPTOR_SEQUENCES = false; private boolean DONT_CLIP_ADAPTOR_SEQUENCES = false;
/** /**
* Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail * Do not hard clip the low quality tails of the reads. This option overrides the argument of minimum tail
* quality. * quality.
*/ */
@Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false) @Argument(fullName = "dont_hardclip_low_qual_tails", shortName = "noclip_tail", doc = "", required = false)
protected boolean DONT_CLIP_LOW_QUAL_TAILS = false; private boolean DONT_CLIP_LOW_QUAL_TAILS = false;
/** /**
* Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped * Do not use high quality soft-clipped bases. By default, ReduceReads will hard clip away any low quality soft clipped
@ -144,7 +150,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
* regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual) * regions. The minimum quality for soft clipped bases is the same as the minimum base quality to consider (minqual)
*/ */
@Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false) @Argument(fullName = "dont_use_softclipped_bases", shortName = "no_soft", doc = "", required = false)
protected boolean DONT_USE_SOFTCLIPPED_BASES = false; private boolean DONT_USE_SOFTCLIPPED_BASES = false;
/** /**
* Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee * Do not compress read names. By default, ReduceReads will compress read names to numbers and guarantee
@ -152,47 +158,55 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
* there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing. * there is no guarantee that read name uniqueness will be maintained -- in this case we recommend not compressing.
*/ */
@Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false) @Argument(fullName = "dont_compress_read_names", shortName = "nocmp_names", doc = "", required = false)
protected boolean DONT_COMPRESS_READ_NAMES = false; private boolean DONT_COMPRESS_READ_NAMES = false;
/** /**
* Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval * Optionally hard clip all incoming reads to the desired intervals. The hard clips will happen exactly at the interval
* border. * border.
*/ */
@Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false) @Argument(fullName = "hard_clip_to_interval", shortName = "clip_int", doc = "", required = false)
protected boolean HARD_CLIP_TO_INTERVAL = false; private boolean HARD_CLIP_TO_INTERVAL = false;
/** /**
* Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be * Minimum proportion of mismatches in a site to trigger a variant region. Anything below this will be
* considered consensus. * considered consensus.
*/ */
@Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false) @Argument(fullName = "minimum_alt_proportion_to_trigger_variant", shortName = "minvar", doc = "", required = false)
protected double minAltProportionToTriggerVariant = 0.05; private double minAltProportionToTriggerVariant = 0.05;
/** /**
* Minimum proportion of indels in a site to trigger a variant region. Anything below this will be * Minimum proportion of indels in a site to trigger a variant region. Anything below this will be
* considered consensus. * considered consensus.
*/ */
@Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false) @Argument(fullName = "minimum_del_proportion_to_trigger_variant", shortName = "mindel", doc = "", required = false)
protected double minIndelProportionToTriggerVariant = 0.05; private double minIndelProportionToTriggerVariant = 0.05;
/** /**
* Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this). * Downsamples the coverage of a variable region approximately (guarantees the minimum to be equal to this).
* A value of 0 turns downsampling off. * A value of 0 turns downsampling off.
*/ */
@Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false) @Argument(fullName = "downsample_coverage", shortName = "ds", doc = "", required = false)
protected int downsampleCoverage = 250; private int downsampleCoverage = 250;
/**
* Number of chromossomes in the sample (this is used for the polyploid consensus compression). Only
* tested for humans (or organisms with n=2). Use at your own risk!
*/
@Hidden
@Argument(fullName = "contigs", shortName = "ctg", doc = "", required = false)
private int nContigs = 2;
@Hidden @Hidden
@Argument(fullName = "", shortName = "dl", doc = "", required = false) @Argument(fullName = "", shortName = "dl", doc = "", required = false)
protected int debugLevel = 0; private int debugLevel = 0;
@Hidden @Hidden
@Argument(fullName = "", shortName = "dr", doc = "", required = false) @Argument(fullName = "", shortName = "dr", doc = "", required = false)
protected String debugRead = ""; private String debugRead = "";
@Hidden @Hidden
@Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false) @Argument(fullName = "downsample_strategy", shortName = "dm", doc = "", required = false)
protected DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal; private DownsampleStrategy downsampleStrategy = DownsampleStrategy.Normal;
@Hidden @Hidden
@Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false) @Argument(fullName = "no_pg_tag", shortName = "npt", doc ="", required = false)
@ -203,7 +217,6 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
Adaptive Adaptive
} }
protected int totalReads = 0;
int nCompressedReads = 0; int nCompressedReads = 0;
HashMap<String, Long> readNameHash; // This hash will keep the name of the original read the new compressed name (a number). HashMap<String, Long> readNameHash; // This hash will keep the name of the original read the new compressed name (a number).
@ -247,16 +260,15 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
* @return a linked list with all the reads produced by the clipping operations * @return a linked list with all the reads produced by the clipping operations
*/ */
@Override @Override
public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { public LinkedList<GATKSAMRecord> map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
LinkedList<GATKSAMRecord> mappedReads; LinkedList<GATKSAMRecord> mappedReads;
totalReads++;
if (!debugRead.isEmpty() && read.getReadName().contains(debugRead)) if (!debugRead.isEmpty() && read.getReadName().contains(debugRead))
System.out.println("Found debug read!"); System.out.println("Found debug read!");
if (debugLevel == 1) if (debugLevel == 1)
System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd()); System.out.printf("\nOriginal: %s %s %d %d\n", read, read.getCigar(), read.getAlignmentStart(), read.getAlignmentEnd());
// we write the actual alignment starts to their respectiv alignment shift tags in the temporary // we write the actual alignment starts to their respective alignment shift tags in the temporary
// attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file // attribute hash so we can determine later if we need to write down the alignment shift to the reduced BAM file
read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart()); read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, read.getAlignmentStart());
read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd()); read.setTemporaryAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, read.getAlignmentEnd());
@ -316,7 +328,7 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
*/ */
@Override @Override
public ReduceReadsStash reduceInit() { public ReduceReadsStash reduceInit() {
return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy)); return new ReduceReadsStash(new MultiSampleCompressor(getToolkit().getSAMFileHeader(), contextSize, downsampleCoverage, minMappingQuality, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, downsampleStrategy, nContigs, USE_POLYPLOID_REDUCTION));
} }
/** /**
@ -532,8 +544,6 @@ public class ReduceReads extends ReadWalker<LinkedList<GATKSAMRecord>, ReduceRea
read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (start) read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_START_SHIFT, startShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (start)
if (endShift > 0) if (endShift > 0)
read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (end) read.setAttribute(GATKSAMRecord.REDUCED_READ_ORIGINAL_ALIGNMENT_END_SHIFT, endShift); // If the read had any soft clips before getting chopped (variant region) annotate it's original alignment (end)
totalReads++;
} }
if (debugLevel == 1) if (debugLevel == 1)

View File

@ -1,6 +1,5 @@
package org.broadinstitute.sting.gatk.walkers.compression.reducereads; package org.broadinstitute.sting.gatk.walkers.compression.reducereads;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator; import org.broadinstitute.sting.utils.sam.AlignmentStartWithNoTiesComparator;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -8,35 +7,33 @@ import java.util.TreeSet;
/** /**
* *
* @author depristo * @author carneiro, depristo
* @version 0.1 * @version 3.0
*/ */
public class SingleSampleCompressor implements Compressor { public class SingleSampleCompressor implements Compressor {
protected static final Logger logger = Logger.getLogger(SingleSampleCompressor.class); final private int contextSize;
final private int downsampleCoverage;
final private int minMappingQuality;
final private double minAltProportionToTriggerVariant;
final private double minIndelProportionToTriggerVariant;
final private int minBaseQual;
final private ReduceReads.DownsampleStrategy downsampleStrategy;
final private int nContigs;
final private boolean allowPolyploidReduction;
protected final int contextSize; private SlidingWindow slidingWindow;
protected final int downsampleCoverage; private int slidingWindowCounter;
protected int minMappingQuality;
protected int slidingWindowCounter;
protected final String sampleName;
protected SlidingWindow slidingWindow; public SingleSampleCompressor(final int contextSize,
protected double minAltProportionToTriggerVariant;
protected double minIndelProportionToTriggerVariant;
protected int minBaseQual;
protected ReduceReads.DownsampleStrategy downsampleStrategy;
public SingleSampleCompressor(final String sampleName,
final int contextSize,
final int downsampleCoverage, final int downsampleCoverage,
final int minMappingQuality, final int minMappingQuality,
final double minAltProportionToTriggerVariant, final double minAltProportionToTriggerVariant,
final double minIndelProportionToTriggerVariant, final double minIndelProportionToTriggerVariant,
final int minBaseQual, final int minBaseQual,
final ReduceReads.DownsampleStrategy downsampleStrategy) { final ReduceReads.DownsampleStrategy downsampleStrategy,
this.sampleName = sampleName; final int nContigs,
final boolean allowPolyploidReduction) {
this.contextSize = contextSize; this.contextSize = contextSize;
this.downsampleCoverage = downsampleCoverage; this.downsampleCoverage = downsampleCoverage;
this.minMappingQuality = minMappingQuality; this.minMappingQuality = minMappingQuality;
@ -45,6 +42,8 @@ public class SingleSampleCompressor implements Compressor {
this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant; this.minIndelProportionToTriggerVariant = minIndelProportionToTriggerVariant;
this.minBaseQual = minBaseQual; this.minBaseQual = minBaseQual;
this.downsampleStrategy = downsampleStrategy; this.downsampleStrategy = downsampleStrategy;
this.nContigs = nContigs;
this.allowPolyploidReduction = allowPolyploidReduction;
} }
/** /**
@ -66,7 +65,7 @@ public class SingleSampleCompressor implements Compressor {
} }
if ( slidingWindow == null) { // this is the first read if ( slidingWindow == null) { // this is the first read
slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities()); slidingWindow = new SlidingWindow(read.getReferenceName(), read.getReferenceIndex(), contextSize, read.getHeader(), read.getReadGroup(), slidingWindowCounter, minAltProportionToTriggerVariant, minIndelProportionToTriggerVariant, minBaseQual, minMappingQuality, downsampleCoverage, downsampleStrategy, read.hasBaseIndelQualities(), nContigs, allowPolyploidReduction);
slidingWindowCounter++; slidingWindowCounter++;
} }

View File

@ -8,14 +8,12 @@ import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler; import org.broadinstitute.sting.gatk.downsampling.ReservoirDownsampler;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.sam.ReadUtils;
import java.util.Iterator; import java.util.*;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
@ -26,13 +24,12 @@ import java.util.ListIterator;
public class SlidingWindow { public class SlidingWindow {
// Sliding Window data // Sliding Window data
final private LinkedList<GATKSAMRecord> readsInWindow; final private TreeSet<GATKSAMRecord> readsInWindow;
final private LinkedList<HeaderElement> windowHeader; final private LinkedList<HeaderElement> windowHeader;
protected int contextSize; // the largest context size (between mismatches and indels) protected int contextSize; // the largest context size (between mismatches and indels)
protected int stopLocation;
protected String contig; protected String contig;
protected int contigIndex; protected int contigIndex;
protected SAMFileHeader header; protected SAMFileHeader samHeader;
protected GATKSAMReadGroupRecord readGroupAttribute; protected GATKSAMReadGroupRecord readGroupAttribute;
protected int downsampleCoverage; protected int downsampleCoverage;
@ -56,6 +53,10 @@ public class SlidingWindow {
protected ReduceReads.DownsampleStrategy downsampleStrategy; protected ReduceReads.DownsampleStrategy downsampleStrategy;
private boolean hasIndelQualities; private boolean hasIndelQualities;
private final int nContigs;
private boolean allowPolyploidReductionInGeneral;
/** /**
* The types of synthetic reads to use in the finalizeAndAdd method * The types of synthetic reads to use in the finalizeAndAdd method
*/ */
@ -66,7 +67,11 @@ public class SlidingWindow {
} }
public int getStopLocation() { public int getStopLocation() {
return stopLocation; return getStopLocation(windowHeader);
}
private int getStopLocation(LinkedList<HeaderElement> header) {
return getStartLocation(header) + header.size() - 1;
} }
public String getContig() { public String getContig() {
@ -77,13 +82,12 @@ public class SlidingWindow {
return contigIndex; return contigIndex;
} }
public int getStartLocation() { public int getStartLocation(LinkedList<HeaderElement> header) {
return windowHeader.isEmpty() ? -1 : windowHeader.peek().getLocation(); return header.isEmpty() ? -1 : header.peek().getLocation();
} }
public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader header, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities) { public SlidingWindow(String contig, int contigIndex, int contextSize, SAMFileHeader samHeader, GATKSAMReadGroupRecord readGroupAttribute, int windowNumber, final double minAltProportionToTriggerVariant, final double minIndelProportionToTriggerVariant, int minBaseQual, int minMappingQuality, int downsampleCoverage, final ReduceReads.DownsampleStrategy downsampleStrategy, boolean hasIndelQualities, int nContigs, boolean allowPolyploidReduction) {
this.stopLocation = -1;
this.contextSize = contextSize; this.contextSize = contextSize;
this.downsampleCoverage = downsampleCoverage; this.downsampleCoverage = downsampleCoverage;
@ -93,11 +97,17 @@ public class SlidingWindow {
this.MIN_MAPPING_QUALITY = minMappingQuality; this.MIN_MAPPING_QUALITY = minMappingQuality;
this.windowHeader = new LinkedList<HeaderElement>(); this.windowHeader = new LinkedList<HeaderElement>();
this.readsInWindow = new LinkedList<GATKSAMRecord>(); this.readsInWindow = new TreeSet<GATKSAMRecord>(new Comparator<GATKSAMRecord>() {
@Override
public int compare(GATKSAMRecord read1, GATKSAMRecord read2) {
final int difference = read1.getSoftEnd() - read2.getSoftEnd();
return difference != 0 ? difference : read1.getReadName().compareTo(read2.getReadName());
}
});
this.contig = contig; this.contig = contig;
this.contigIndex = contigIndex; this.contigIndex = contigIndex;
this.header = header; this.samHeader = samHeader;
this.readGroupAttribute = readGroupAttribute; this.readGroupAttribute = readGroupAttribute;
this.consensusCounter = 0; this.consensusCounter = 0;
@ -111,6 +121,9 @@ public class SlidingWindow {
this.downsampleStrategy = downsampleStrategy; this.downsampleStrategy = downsampleStrategy;
this.hasIndelQualities = hasIndelQualities; this.hasIndelQualities = hasIndelQualities;
this.nContigs = nContigs;
this.allowPolyploidReductionInGeneral = allowPolyploidReduction;
} }
/** /**
@ -125,7 +138,7 @@ public class SlidingWindow {
* @return a list of reads that have been finished by sliding the window. * @return a list of reads that have been finished by sliding the window.
*/ */
public List<GATKSAMRecord> addRead(GATKSAMRecord read) { public List<GATKSAMRecord> addRead(GATKSAMRecord read) {
updateHeaderCounts(read, false); // update the window header counts addToHeader(windowHeader, read); // update the window header counts
readsInWindow.add(read); // add read to sliding reads readsInWindow.add(read); // add read to sliding reads
return slideWindow(read.getUnclippedStart()); return slideWindow(read.getUnclippedStart());
} }
@ -188,54 +201,105 @@ public class SlidingWindow {
* @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start! * @param incomingReadUnclippedStart the incoming read's start position. Must be the unclipped start!
* @return all reads that have fallen to the left of the sliding window after the slide * @return all reads that have fallen to the left of the sliding window after the slide
*/ */
protected List<GATKSAMRecord> slideWindow(int incomingReadUnclippedStart) { protected List<GATKSAMRecord> slideWindow(final int incomingReadUnclippedStart) {
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>(); List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
if (incomingReadUnclippedStart - contextSize > getStartLocation()) { final int windowHeaderStartLocation = getStartLocation(windowHeader);
int readStartHeaderIndex = incomingReadUnclippedStart - getStartLocation();
boolean[] variantSite = markSites(getStartLocation() + readStartHeaderIndex); if (incomingReadUnclippedStart - contextSize > windowHeaderStartLocation) {
markSites(incomingReadUnclippedStart);
int readStartHeaderIndex = incomingReadUnclippedStart - windowHeaderStartLocation;
int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive) int breakpoint = Math.max(readStartHeaderIndex - contextSize - 1, 0); // this is the limit of what we can close/send to consensus (non-inclusive)
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, variantSite); List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet());
finalizedReads = closeVariantRegions(regions, false); finalizedReads = closeVariantRegions(regions, false);
List<GATKSAMRecord> readsToRemove = new LinkedList<GATKSAMRecord>(); while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) {
for (GATKSAMRecord read : readsInWindow) { // todo -- unnecessarily going through all reads in the window !! Optimize this (But remember reads are not sorted by alignment end!) readsInWindow.pollFirst();
if (read.getAlignmentEnd() < getStartLocation()) {
readsToRemove.add(read);
}
}
for (GATKSAMRecord read : readsToRemove) {
readsInWindow.remove(read);
} }
} }
return finalizedReads; return finalizedReads;
} }
private final class MarkedSites {
private boolean[] siteIsVariant = new boolean[0];
private int startLocation = 0;
public MarkedSites() {}
public boolean[] getVariantSiteBitSet() { return siteIsVariant; }
/**
* Updates the variant site bitset given the new startlocation and size of the region to mark.
*
* @param newStartLocation the new start location of the bitset
* @param sizeOfRegion the new size of the region to be represented
*
* @return the end position (newStartLocation + index) of the region marked by this method; the calling method is responsible for the remainder.
*/
public int updateRegion(final int newStartLocation, final int sizeOfRegion) {
int lastPositionMarked = sizeOfRegion;
// if this is the first time we set the array and we can't reuse anything, just create a new array from scratch
if ( newStartLocation >= this.startLocation + siteIsVariant.length || newStartLocation < this.startLocation ) {
siteIsVariant = new boolean[sizeOfRegion];
lastPositionMarked = 0;
}
// if the dimensions change, copy what we can and continue
else if ( newStartLocation != this.startLocation || sizeOfRegion != siteIsVariant.length ) {
final boolean[] tempArray = new boolean[sizeOfRegion];
final int differenceInStartPositions = newStartLocation - this.startLocation;
lastPositionMarked = Math.min(siteIsVariant.length - differenceInStartPositions, sizeOfRegion);
System.arraycopy(siteIsVariant, differenceInStartPositions, tempArray, 0, lastPositionMarked);
siteIsVariant = null; // explicitly allow garbage collection
siteIsVariant = tempArray;
}
this.startLocation = newStartLocation;
return lastPositionMarked + newStartLocation;
}
}
private final MarkedSites markedSites = new MarkedSites();
/** /**
* returns an array marked with variant and non-variant regions (it uses * returns an array marked with variant and non-variant regions (it uses
* markVariantRegions to make the marks) * markVariantRegions to make the marks)
* *
* @param stop check the window from start to stop (not-inclusive) * @param stop check the window from start to stop (not-inclusive)
* @return a boolean array with 'true' marking variant regions and false marking consensus sites
*/ */
protected boolean[] markSites(int stop) { protected void markSites(final int stop) {
boolean[] markedSites = new boolean[stop - getStartLocation() + contextSize + 1]; final int windowHeaderStartLocation = getStartLocation(windowHeader);
final int sizeOfMarkedRegion = stop - windowHeaderStartLocation + contextSize + 1;
// copy over as many bits as we can from the previous calculation. Note that we can't trust the
// last (contextSize - 1) worth of bits because we may not have actually looked at variant regions there.
final int lastPositionMarked = markedSites.updateRegion(windowHeaderStartLocation, sizeOfMarkedRegion) - contextSize - 1;
final int locationToProcess = Math.min(lastPositionMarked, stop - contextSize);
// update the iterator to the correct position
Iterator<HeaderElement> headerElementIterator = windowHeader.iterator(); Iterator<HeaderElement> headerElementIterator = windowHeader.iterator();
for (int i = getStartLocation(); i < stop; i++) { for (int i = windowHeaderStartLocation; i < locationToProcess; i++) {
if (headerElementIterator.hasNext())
headerElementIterator.next();
}
// process a contextSize worth of region from scratch in case there's a variant there
for (int i = locationToProcess; i < stop; i++) {
if (headerElementIterator.hasNext()) { if (headerElementIterator.hasNext()) {
HeaderElement headerElement = headerElementIterator.next(); HeaderElement headerElement = headerElementIterator.next();
if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT)) if (headerElement.isVariant(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT, MIN_INDEL_BASE_PROPORTION_TO_TRIGGER_VARIANT))
markVariantRegion(markedSites, i - getStartLocation()); markVariantRegion(markedSites, i - windowHeaderStartLocation);
} else } else
break; break;
} }
return markedSites;
} }
/** /**
@ -244,11 +308,11 @@ public class SlidingWindow {
* @param markedSites the boolean array to bear the marks * @param markedSites the boolean array to bear the marks
* @param variantSiteLocation the location where a variant site was found * @param variantSiteLocation the location where a variant site was found
*/ */
protected void markVariantRegion(boolean[] markedSites, int variantSiteLocation) { protected void markVariantRegion(final MarkedSites markedSites, final int variantSiteLocation) {
int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize; int from = (variantSiteLocation < contextSize) ? 0 : variantSiteLocation - contextSize;
int to = (variantSiteLocation + contextSize + 1 > markedSites.length) ? markedSites.length : variantSiteLocation + contextSize + 1; int to = (variantSiteLocation + contextSize + 1 > markedSites.getVariantSiteBitSet().length) ? markedSites.getVariantSiteBitSet().length : variantSiteLocation + contextSize + 1;
for (int i = from; i < to; i++) for (int i = from; i < to; i++)
markedSites[i] = true; markedSites.getVariantSiteBitSet()[i] = true;
} }
/** /**
@ -260,46 +324,45 @@ public class SlidingWindow {
* @param end the first header index NOT TO add to consensus * @param end the first header index NOT TO add to consensus
* @return a list of consensus reads generated by this call. Empty list if no consensus was generated. * @return a list of consensus reads generated by this call. Empty list if no consensus was generated.
*/ */
protected List<GATKSAMRecord> addToSyntheticReads(int start, int end) { protected List<GATKSAMRecord> addToSyntheticReads(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
LinkedList<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>(); LinkedList<GATKSAMRecord> reads = new LinkedList<GATKSAMRecord>();
if (start < end) { if (start < end) {
ListIterator<HeaderElement> headerElementIterator = header.listIterator(start);
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start);
if (!headerElementIterator.hasNext()) if (!headerElementIterator.hasNext())
throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, windowHeader.size(), end)); throw new ReviewedStingException(String.format("Requested to add to synthetic reads a region that contains no header element at index: %d - %d / %d", start, header.size(), end));
HeaderElement headerElement = headerElementIterator.next(); HeaderElement headerElement = headerElementIterator.next();
if (headerElement.hasConsensusData()) { if (headerElement.hasConsensusData()) {
reads.addAll(finalizeAndAdd(ConsensusType.FILTERED)); reads.addAll(finalizeAndAdd(ConsensusType.FILTERED));
int endOfConsensus = findNextNonConsensusElement(start, end); int endOfConsensus = findNextNonConsensusElement(header, start, end);
addToRunningConsensus(start, endOfConsensus); addToRunningConsensus(header, start, endOfConsensus, isNegativeStrand);
if (endOfConsensus <= start) if (endOfConsensus <= start)
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start)); throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfConsensus, start));
reads.addAll(addToSyntheticReads(endOfConsensus, end)); reads.addAll(addToSyntheticReads(header, endOfConsensus, end, isNegativeStrand));
} else if (headerElement.hasFilteredData()) { } else if (headerElement.hasFilteredData()) {
reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS)); reads.addAll(finalizeAndAdd(ConsensusType.CONSENSUS));
int endOfFilteredData = findNextNonFilteredDataElement(start, end); int endOfFilteredData = findNextNonFilteredDataElement(header, start, end);
addToFilteredData(start, endOfFilteredData); reads.addAll(addToFilteredData(header, start, endOfFilteredData, isNegativeStrand));
if (endOfFilteredData <= start) if (endOfFilteredData <= start)
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start)); throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfFilteredData, start));
reads.addAll(addToSyntheticReads(endOfFilteredData, end)); reads.addAll(addToSyntheticReads(header, endOfFilteredData, end, isNegativeStrand));
} else if (headerElement.isEmpty()) { } else if (headerElement.isEmpty()) {
reads.addAll(finalizeAndAdd(ConsensusType.BOTH)); reads.addAll(finalizeAndAdd(ConsensusType.BOTH));
int endOfEmptyData = findNextNonEmptyElement(start, end); int endOfEmptyData = findNextNonEmptyElement(header, start, end);
if (endOfEmptyData <= start) if (endOfEmptyData <= start)
throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start)); throw new ReviewedStingException(String.format("next start is <= current start: (%d <= %d)", endOfEmptyData, start));
reads.addAll(addToSyntheticReads(endOfEmptyData, end)); reads.addAll(addToSyntheticReads(header, endOfEmptyData, end, isNegativeStrand));
} else } else
throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start)); throw new ReviewedStingException(String.format("Header Element %d is neither Consensus, Data or Empty. Something is wrong.", start));
@ -343,8 +406,8 @@ public class SlidingWindow {
* @param upTo limit to search for another consensus element * @param upTo limit to search for another consensus element
* @return next position with consensus data or empty * @return next position with consensus data or empty
*/ */
private int findNextNonConsensusElement(int start, int upTo) { private int findNextNonConsensusElement(LinkedList<HeaderElement> header, int start, int upTo) {
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start); Iterator<HeaderElement> headerElementIterator = header.listIterator(start);
int index = start; int index = start;
while (index < upTo) { while (index < upTo) {
if (!headerElementIterator.hasNext()) if (!headerElementIterator.hasNext())
@ -365,8 +428,8 @@ public class SlidingWindow {
* @param upTo limit to search for * @param upTo limit to search for
* @return next position with no filtered data * @return next position with no filtered data
*/ */
private int findNextNonFilteredDataElement(int start, int upTo) { private int findNextNonFilteredDataElement(LinkedList<HeaderElement> header, int start, int upTo) {
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start); Iterator<HeaderElement> headerElementIterator = header.listIterator(start);
int index = start; int index = start;
while (index < upTo) { while (index < upTo) {
if (!headerElementIterator.hasNext()) if (!headerElementIterator.hasNext())
@ -387,8 +450,8 @@ public class SlidingWindow {
* @param upTo limit to search for * @param upTo limit to search for
* @return next position with non-empty element * @return next position with non-empty element
*/ */
private int findNextNonEmptyElement(int start, int upTo) { private int findNextNonEmptyElement(LinkedList<HeaderElement> header, int start, int upTo) {
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start); ListIterator<HeaderElement> headerElementIterator = header.listIterator(start);
int index = start; int index = start;
while (index < upTo) { while (index < upTo) {
if (!headerElementIterator.hasNext()) if (!headerElementIterator.hasNext())
@ -412,11 +475,13 @@ public class SlidingWindow {
* @param start the first header index to add to consensus * @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus * @param end the first header index NOT TO add to consensus
*/ */
private void addToFilteredData(int start, int end) { private List<GATKSAMRecord> addToFilteredData(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
if (filteredDataConsensus == null) List<GATKSAMRecord> result = new ArrayList<GATKSAMRecord>(0);
filteredDataConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities);
ListIterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start); if (filteredDataConsensus == null)
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
ListIterator<HeaderElement> headerElementIterator = header.listIterator(start);
for (int index = start; index < end; index++) { for (int index = start; index < end; index++) {
if (!headerElementIterator.hasNext()) if (!headerElementIterator.hasNext())
throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist"); throw new ReviewedStingException("Requested to create a filtered data synthetic read from " + start + " to " + end + " but " + index + " does not exist");
@ -428,8 +493,15 @@ public class SlidingWindow {
if (!headerElement.hasFilteredData()) if (!headerElement.hasFilteredData())
throw new ReviewedStingException("No filtered data in " + index); throw new ReviewedStingException("No filtered data in " + index);
if ( filteredDataConsensus.getRefStart() + filteredDataConsensus.size() != headerElement.getLocation() ) {
result.add(finalizeFilteredDataConsensus());
filteredDataConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, filteredDataReadName + filteredDataConsensusCounter++, headerElement.getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
}
genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS()); genericAddBaseToConsensus(filteredDataConsensus, headerElement.getFilteredBaseCounts(), headerElement.getRMS());
} }
return result;
} }
/** /**
@ -441,11 +513,11 @@ public class SlidingWindow {
* @param start the first header index to add to consensus * @param start the first header index to add to consensus
* @param end the first header index NOT TO add to consensus * @param end the first header index NOT TO add to consensus
*/ */
private void addToRunningConsensus(int start, int end) { private void addToRunningConsensus(LinkedList<HeaderElement> header, int start, int end, boolean isNegativeStrand) {
if (runningConsensus == null) if (runningConsensus == null)
runningConsensus = new SyntheticRead(header, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, windowHeader.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities); runningConsensus = new SyntheticRead(samHeader, readGroupAttribute, contig, contigIndex, consensusReadName + consensusCounter++, header.get(start).getLocation(), GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, hasIndelQualities, isNegativeStrand);
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(start); Iterator<HeaderElement> headerElementIterator = header.listIterator(start);
for (int index = start; index < end; index++) { for (int index = start; index < end; index++) {
if (!headerElementIterator.hasNext()) if (!headerElementIterator.hasNext())
throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist"); throw new ReviewedStingException("Requested to create a running consensus synthetic read from " + start + " to " + end + " but " + index + " does not exist");
@ -466,14 +538,76 @@ public class SlidingWindow {
* @param rms the rms mapping quality in the header element * @param rms the rms mapping quality in the header element
*/ */
private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) { private void genericAddBaseToConsensus(SyntheticRead syntheticRead, BaseAndQualsCounts baseCounts, double rms) {
BaseIndex base = baseCounts.baseIndexWithMostCounts(); final BaseIndex base = baseCounts.baseIndexWithMostProbability();
byte count = (byte) Math.min(baseCounts.countOfMostCommonBase(), Byte.MAX_VALUE); byte count = (byte) Math.min(baseCounts.countOfBase(base), Byte.MAX_VALUE);
byte qual = baseCounts.averageQualsOfMostCommonBase(); byte qual = baseCounts.averageQualsOfBase(base);
byte insQual = baseCounts.averageInsertionQualsOfMostCommonBase(); byte insQual = baseCounts.averageInsertionQualsOfBase(base);
byte delQual = baseCounts.averageDeletionQualsOfMostCommonBase(); byte delQual = baseCounts.averageDeletionQualsOfBase(base);
syntheticRead.add(base, count, qual, insQual, delQual, rms); syntheticRead.add(base, count, qual, insQual, delQual, rms);
} }
private List<GATKSAMRecord> compressVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>();
// Try to compress into a polyploid consensus
int nHaplotypes = 0;
int hetRefPosition = -1;
boolean canCompress = true;
boolean foundEvent = false;
Object[] header = windowHeader.toArray();
// foundEvent will remain false if we don't allow polyploid reduction
if ( allowPolyploidReductionInGeneral && !disallowPolyploidReductionAtThisPosition ) {
for (int i = start; i<=stop; i++) {
nHaplotypes = ((HeaderElement) header[i]).getNumberOfHaplotypes(MIN_ALT_BASE_PROPORTION_TO_TRIGGER_VARIANT);
if (nHaplotypes > nContigs) {
canCompress = false;
break;
}
// guarantees that there is only 1 site in the variant region that needs more than one haplotype
if (nHaplotypes > 1) {
if (!foundEvent) {
foundEvent = true;
hetRefPosition = i;
}
else {
canCompress = false;
break;
}
}
}
}
// Try to compress the variant region
// the "foundEvent" protects us from trying to compress variant regions that are created by insertions
if (canCompress && foundEvent) {
allReads = createPolyploidConsensus(start, stop, nHaplotypes, ((HeaderElement) header[hetRefPosition]).getLocation());
}
// Return all reads that overlap the variant region and remove them from the window header entirely
// also remove all reads preceding the variant region (since they will be output as consensus right after compression
else {
final int refStart = windowHeader.get(start).getLocation();
final int refStop = windowHeader.get(stop).getLocation();
LinkedList<GATKSAMRecord> toRemove = new LinkedList<GATKSAMRecord>();
for (GATKSAMRecord read : readsInWindow) {
if (read.getSoftStart() <= refStop) {
if (read.getAlignmentEnd() >= refStart) {
allReads.add(read);
removeFromHeader(windowHeader, read);
}
toRemove.add(read);
}
}
for (GATKSAMRecord read : toRemove) {
readsInWindow.remove(read);
}
}
return allReads;
}
/** /**
* Finalizes a variant region, any adjacent synthetic reads. * Finalizes a variant region, any adjacent synthetic reads.
* *
@ -482,27 +616,13 @@ public class SlidingWindow {
* @return all reads contained in the variant region plus any adjacent synthetic reads * @return all reads contained in the variant region plus any adjacent synthetic reads
*/ */
@Requires("start <= stop") @Requires("start <= stop")
protected List<GATKSAMRecord> closeVariantRegion(int start, int stop) { protected List<GATKSAMRecord> closeVariantRegion(final int start, final int stop, final boolean disallowPolyploidReductionAtThisPosition) {
List<GATKSAMRecord> allReads = new LinkedList<GATKSAMRecord>(); List<GATKSAMRecord> allReads = compressVariantRegion(start, stop, disallowPolyploidReductionAtThisPosition);
int refStart = windowHeader.get(start).getLocation(); // All operations are reference based, not read based
int refStop = windowHeader.get(stop).getLocation();
for (GATKSAMRecord read : readsInWindow) { // Keep all reads that overlap the variant region
if (read.getSoftStart() <= refStop && read.getAlignmentEnd() >= refStart) {
allReads.add(read);
updateHeaderCounts(read, true); // Remove this read from the window header entirely
}
}
List<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads; List<GATKSAMRecord> result = (downsampleCoverage > 0) ? downsampleVariantRegion(allReads) : allReads;
result.addAll(addToSyntheticReads(0, start)); result.addAll(addToSyntheticReads(windowHeader, 0, stop, false));
result.addAll(finalizeAndAdd(ConsensusType.BOTH)); result.addAll(finalizeAndAdd(ConsensusType.BOTH));
for (GATKSAMRecord read : allReads) {
readsInWindow.remove(read); // todo -- not optimal, but needs to be done so the next region doesn't try to remove the same reads from the header counts.
}
return result; // finalized reads will be downsampled if necessary return result; // finalized reads will be downsampled if necessary
} }
@ -517,7 +637,7 @@ public class SlidingWindow {
if (stop < 0 && forceClose) if (stop < 0 && forceClose)
stop = windowHeader.size() - 1; stop = windowHeader.size() - 1;
if (stop >= 0) { if (stop >= 0) {
allReads.addAll(closeVariantRegion(start, stop)); allReads.addAll(closeVariantRegion(start, stop, regions.size() > 1));
lastStop = stop; lastStop = stop;
} }
} }
@ -545,7 +665,7 @@ public class SlidingWindow {
ReservoirDownsampler <GATKSAMRecord> downsampler = new ReservoirDownsampler<GATKSAMRecord>(downsampleCoverage); ReservoirDownsampler <GATKSAMRecord> downsampler = new ReservoirDownsampler<GATKSAMRecord>(downsampleCoverage);
downsampler.submit(allReads); downsampler.submit(allReads);
return downsampler.consumeDownsampledItems(); return downsampler.consumeFinalizedItems();
} }
@ -561,16 +681,17 @@ public class SlidingWindow {
List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>(); List<GATKSAMRecord> finalizedReads = new LinkedList<GATKSAMRecord>();
if (!windowHeader.isEmpty()) { if (!windowHeader.isEmpty()) {
boolean[] variantSite = markSites(stopLocation + 1); markSites(getStopLocation(windowHeader) + 1);
List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), variantSite); List<Pair<Integer,Integer>> regions = getAllVariantRegions(0, windowHeader.size(), markedSites.getVariantSiteBitSet());
finalizedReads = closeVariantRegions(regions, true); finalizedReads = closeVariantRegions(regions, true);
if (!windowHeader.isEmpty()) { if (!windowHeader.isEmpty()) {
finalizedReads.addAll(addToSyntheticReads(0, windowHeader.size() - 1)); finalizedReads.addAll(addToSyntheticReads(windowHeader, 0, windowHeader.size(), false));
finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up finalizedReads.addAll(finalizeAndAdd(ConsensusType.BOTH)); // if it ended in running consensus, finish it up
} }
} }
return finalizedReads; return finalizedReads;
} }
@ -611,13 +732,96 @@ public class SlidingWindow {
} }
private List<GATKSAMRecord> createPolyploidConsensus(int start, int stop, int nHaplotypes, int hetRefPosition) {
// we will create two (positive strand, negative strand) headers for each contig
List<LinkedList<HeaderElement>> headersPosStrand = new ArrayList<LinkedList<HeaderElement>>();
List<LinkedList<HeaderElement>> headersNegStrand = new ArrayList<LinkedList<HeaderElement>>();
List<GATKSAMRecord> hetReads = new LinkedList<GATKSAMRecord>();
Map<Byte, Integer> haplotypeHeaderMap = new HashMap<Byte, Integer>(nHaplotypes);
int currentHaplotype = 0;
int refStart = windowHeader.get(start).getLocation();
int refStop = windowHeader.get(stop).getLocation();
List<GATKSAMRecord> toRemove = new LinkedList<GATKSAMRecord>();
for (GATKSAMRecord read : readsInWindow) {
int haplotype;
// check if the read is either before or inside the variant region
if (read.getSoftStart() <= refStop) {
// check if the read is inside the variant region
if (read.getMappingQuality() >= MIN_MAPPING_QUALITY && read.getSoftEnd() >= refStart) {
// check if the read contains the het site
if (read.getSoftStart() <= hetRefPosition && read.getSoftEnd() >= hetRefPosition) {
int readPos = ReadUtils.getReadCoordinateForReferenceCoordinate(read, hetRefPosition, ReadUtils.ClippingTail.LEFT_TAIL);
byte base = read.getReadBases()[readPos];
byte qual = read.getBaseQualities(EventType.BASE_SUBSTITUTION)[readPos];
// check if base passes the filters!
if (qual >= MIN_BASE_QUAL_TO_COUNT) {
// check which haplotype this read represents and take the index of it from the list of headers
if (haplotypeHeaderMap.containsKey(base)) {
haplotype = haplotypeHeaderMap.get(base);
}
// create new lists if this haplotype has not been seen yet
else {
haplotype = currentHaplotype;
haplotypeHeaderMap.put(base, currentHaplotype);
headersPosStrand.add(new LinkedList<HeaderElement>());
headersNegStrand.add(new LinkedList<HeaderElement>());
currentHaplotype++;
}
LinkedList<HeaderElement> header = read.getReadNegativeStrandFlag() ? headersNegStrand.get(haplotype) : headersPosStrand.get(haplotype);
// add to the polyploid header
addToHeader(header, read);
// remove from the standard header so that we don't double count it
removeFromHeader(windowHeader, read);
}
}
}
// we remove all reads before and inside the variant region from the window
toRemove.add(read);
}
}
for (LinkedList<HeaderElement> header : headersPosStrand) {
if (header.size() > 0)
hetReads.addAll(addToSyntheticReads(header, 0, header.size(), false));
if (runningConsensus != null)
hetReads.add(finalizeRunningConsensus());
}
for (LinkedList<HeaderElement> header : headersNegStrand) {
if (header.size() > 0)
hetReads.addAll(addToSyntheticReads(header, 0, header.size(), true));
if (runningConsensus != null)
hetReads.add(finalizeRunningConsensus());
}
for (GATKSAMRecord read : toRemove) {
readsInWindow.remove(read);
}
return hetReads;
}
private void addToHeader(LinkedList<HeaderElement> header, GATKSAMRecord read) {
updateHeaderCounts(header, read, false);
}
private void removeFromHeader(LinkedList<HeaderElement> header, GATKSAMRecord read) {
updateHeaderCounts(header, read, true);
}
/** /**
* Updates the sliding window's header counts with the incoming read bases, insertions * Updates the sliding window's header counts with the incoming read bases, insertions
* and deletions. * and deletions.
* *
* @param header the sliding window header to use
* @param read the incoming read to be added to the sliding window * @param read the incoming read to be added to the sliding window
* @param removeRead if we are removing the read from the header or adding
*/ */
protected void updateHeaderCounts(GATKSAMRecord read, boolean removeRead) { private void updateHeaderCounts(LinkedList<HeaderElement> header, GATKSAMRecord read, boolean removeRead) {
byte[] bases = read.getReadBases(); byte[] bases = read.getReadBases();
byte[] quals = read.getBaseQualities(); byte[] quals = read.getBaseQualities();
byte[] insQuals = read.getExistingBaseInsertionQualities(); byte[] insQuals = read.getExistingBaseInsertionQualities();
@ -627,8 +831,9 @@ public class SlidingWindow {
Cigar cigar = read.getCigar(); Cigar cigar = read.getCigar();
int readBaseIndex = 0; int readBaseIndex = 0;
int startLocation = getStartLocation(); int startLocation = getStartLocation(header);
int locationIndex = startLocation < 0 ? 0 : readStart - startLocation; int locationIndex = startLocation < 0 ? 0 : readStart - startLocation;
int stopLocation = getStopLocation(header);
if (removeRead && locationIndex < 0) if (removeRead && locationIndex < 0)
throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " start " + read.getUnclippedStart() + "," + read.getUnclippedEnd() + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation); throw new ReviewedStingException("read is behind the Sliding Window. read: " + read + " start " + read.getUnclippedStart() + "," + read.getUnclippedEnd() + " cigar: " + read.getCigarString() + " window: " + startLocation + "," + stopLocation);
@ -636,7 +841,7 @@ public class SlidingWindow {
if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it if (!removeRead) { // we only need to create new header elements if we are adding the read, not when we're removing it
if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window if (locationIndex < 0) { // Do we need to add extra elements before the start of the header? -- this may happen if the previous read was clipped and this alignment starts before the beginning of the window
for (int i = 1; i <= -locationIndex; i++) for (int i = 1; i <= -locationIndex; i++)
windowHeader.addFirst(new HeaderElement(startLocation - i)); header.addFirst(new HeaderElement(startLocation - i));
startLocation = readStart; // update start location accordingly startLocation = readStart; // update start location accordingly
locationIndex = 0; locationIndex = 0;
@ -645,19 +850,17 @@ public class SlidingWindow {
if (stopLocation < readEnd) { // Do we need to add extra elements to the header? if (stopLocation < readEnd) { // Do we need to add extra elements to the header?
int elementsToAdd = (stopLocation < 0) ? readEnd - readStart + 1 : readEnd - stopLocation; int elementsToAdd = (stopLocation < 0) ? readEnd - readStart + 1 : readEnd - stopLocation;
while (elementsToAdd-- > 0) while (elementsToAdd-- > 0)
windowHeader.addLast(new HeaderElement(readEnd - elementsToAdd)); header.addLast(new HeaderElement(readEnd - elementsToAdd));
stopLocation = readEnd; // update stopLocation accordingly
} }
// Special case for leading insertions before the beginning of the sliding read // Special case for leading insertions before the beginning of the sliding read
if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) { if (ReadUtils.readStartsWithInsertion(read).getFirst() && (readStart == startLocation || startLocation < 0)) {
windowHeader.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added header.addFirst(new HeaderElement(readStart - 1)); // create a new first element to the window header with no bases added
locationIndex = 1; // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing. locationIndex = 1; // This allows the first element (I) to look at locationIndex - 1 in the subsequent switch and do the right thing.
} }
} }
Iterator<HeaderElement> headerElementIterator = windowHeader.listIterator(locationIndex); Iterator<HeaderElement> headerElementIterator = header.listIterator(locationIndex);
HeaderElement headerElement; HeaderElement headerElement;
for (CigarElement cigarElement : cigar.getCigarElements()) { for (CigarElement cigarElement : cigar.getCigarElements()) {
switch (cigarElement.getOperator()) { switch (cigarElement.getOperator()) {
@ -668,7 +871,7 @@ public class SlidingWindow {
break; break;
} }
headerElement = windowHeader.get(locationIndex - 1); // insertions are added to the base to the left (previous element) headerElement = header.get(locationIndex - 1); // insertions are added to the base to the left (previous element)
if (removeRead) { if (removeRead) {
headerElement.removeInsertionToTheRight(); headerElement.removeInsertionToTheRight();

View File

@ -5,9 +5,9 @@ import net.sf.samtools.Cigar;
import net.sf.samtools.CigarElement; import net.sf.samtools.CigarElement;
import net.sf.samtools.CigarOperator; import net.sf.samtools.CigarOperator;
import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileHeader;
import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.recalibration.EventType;
import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -44,8 +44,9 @@ public class SyntheticRead {
private String contig; private String contig;
private int contigIndex; private int contigIndex;
private String readName; private String readName;
private Integer refStart; private int refStart;
private boolean hasIndelQualities = false; private boolean hasIndelQualities = false;
private boolean isNegativeStrand = false;
/** /**
* Full initialization of the running consensus if you have all the information and are ready to * Full initialization of the running consensus if you have all the information and are ready to
@ -59,7 +60,7 @@ public class SyntheticRead {
* @param refStart the alignment start (reference based) * @param refStart the alignment start (reference based)
* @param readTag the reduce reads tag for the synthetic read * @param readTag the reduce reads tag for the synthetic read
*/ */
public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, String readTag, boolean hasIndelQualities) { public SyntheticRead(SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, String readTag, boolean hasIndelQualities, boolean isNegativeRead) {
final int initialCapacity = 10000; final int initialCapacity = 10000;
bases = new ArrayList<BaseIndex>(initialCapacity); bases = new ArrayList<BaseIndex>(initialCapacity);
counts = new ArrayList<Byte>(initialCapacity); counts = new ArrayList<Byte>(initialCapacity);
@ -76,9 +77,10 @@ public class SyntheticRead {
this.readName = readName; this.readName = readName;
this.refStart = refStart; this.refStart = refStart;
this.hasIndelQualities = hasIndelQualities; this.hasIndelQualities = hasIndelQualities;
this.isNegativeStrand = isNegativeRead;
} }
public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, Integer refStart, boolean hasIndelQualities) { public SyntheticRead(List<BaseIndex> bases, List<Byte> counts, List<Byte> quals, List<Byte> insertionQuals, List<Byte> deletionQuals, double mappingQuality, String readTag, SAMFileHeader header, GATKSAMReadGroupRecord readGroupRecord, String contig, int contigIndex, String readName, int refStart, boolean hasIndelQualities, boolean isNegativeRead) {
this.bases = bases; this.bases = bases;
this.counts = counts; this.counts = counts;
this.quals = quals; this.quals = quals;
@ -93,6 +95,7 @@ public class SyntheticRead {
this.readName = readName; this.readName = readName;
this.refStart = refStart; this.refStart = refStart;
this.hasIndelQualities = hasIndelQualities; this.hasIndelQualities = hasIndelQualities;
this.isNegativeStrand = isNegativeRead;
} }
/** /**
@ -112,11 +115,15 @@ public class SyntheticRead {
this.mappingQuality += mappingQuality; this.mappingQuality += mappingQuality;
} }
public BaseIndex getBase(int readCoordinate) { public BaseIndex getBase(final int readCoordinate) {
return bases.get(readCoordinate); return bases.get(readCoordinate);
} }
/** public int getRefStart() {
return refStart;
}
/**
* Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid. * Creates a GATKSAMRecord of the synthetic read. Will return null if the read is invalid.
* *
* Invalid reads are : * Invalid reads are :
@ -133,6 +140,7 @@ public class SyntheticRead {
read.setReferenceIndex(contigIndex); read.setReferenceIndex(contigIndex);
read.setReadPairedFlag(false); read.setReadPairedFlag(false);
read.setReadUnmappedFlag(false); read.setReadUnmappedFlag(false);
read.setReadNegativeStrandFlag(isNegativeStrand);
read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions) read.setCigar(buildCigar()); // the alignment start may change while building the cigar (leading deletions)
read.setAlignmentStart(refStart); read.setAlignmentStart(refStart);
read.setReadName(readName); read.setReadName(readName);

View File

@ -1,11 +1,11 @@
package org.broadinstitute.sting.gatk.walkers.genotyper; package org.broadinstitute.sting.gatk.walkers.genotyper;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Allele;
@ -53,13 +53,14 @@ public class ErrorModel {
PairHMMIndelErrorModel pairModel = null; PairHMMIndelErrorModel pairModel = null;
LinkedHashMap<Allele, Haplotype> haplotypeMap = null; LinkedHashMap<Allele, Haplotype> haplotypeMap = null;
HashMap<PileupElement, LinkedHashMap<Allele, Double>> indelLikelihoodMap = null;
double[][] perReadLikelihoods = null; double[][] perReadLikelihoods = null;
double[] model = new double[maxQualityScore+1]; double[] model = new double[maxQualityScore+1];
Arrays.fill(model,Double.NEGATIVE_INFINITY); Arrays.fill(model,Double.NEGATIVE_INFINITY);
boolean hasCalledAlleles = false; boolean hasCalledAlleles = false;
final PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
if (refSampleVC != null) { if (refSampleVC != null) {
for (Allele allele : refSampleVC.getAlleles()) { for (Allele allele : refSampleVC.getAlleles()) {
@ -71,8 +72,7 @@ public class ErrorModel {
haplotypeMap = new LinkedHashMap<Allele, Haplotype>(); haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
if (refSampleVC.isIndel()) { if (refSampleVC.isIndel()) {
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
indelLikelihoodMap = new HashMap<PileupElement, LinkedHashMap<Allele, Double>>();
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(refSampleVC.getAlleles(), refContext, refContext.getLocus(), haplotypeMap); // will update haplotypeMap adding elements
} }
} }
@ -92,12 +92,12 @@ public class ErrorModel {
Allele refAllele = refSampleVC.getReference(); Allele refAllele = refSampleVC.getReference();
if (refSampleVC.isIndel()) { if ( refSampleVC.isIndel()) {
final int readCounts[] = new int[refSamplePileup.getNumberOfElements()]; final int readCounts[] = new int[refSamplePileup.getNumberOfElements()];
//perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()]; //perReadLikelihoods = new double[readCounts.length][refSampleVC.getAlleles().size()];
final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles()); final int eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(refSampleVC.getAlleles());
if (!haplotypeMap.isEmpty()) if (!haplotypeMap.isEmpty())
perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, indelLikelihoodMap, readCounts); perReadLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(refSamplePileup,haplotypeMap,refContext, eventLength, perReadAlleleLikelihoodMap, readCounts);
} }
int idx = 0; int idx = 0;
for (PileupElement refPileupElement : refSamplePileup) { for (PileupElement refPileupElement : refSamplePileup) {
@ -195,8 +195,8 @@ public class ErrorModel {
if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getEventLength() == -eventLength) if (eventLength < 0 && pileupElement.isBeforeDeletionStart() && pileupElement.getEventLength() == -eventLength)
return true; return true;
if (eventLength > 0 && pileupElement.isBeforeInsertion() && if (eventLength > 0 && pileupElement.isBeforeInsertion() &&
Arrays.equals(pileupElement.getEventBases().getBytes(),alleleBases)) Arrays.equals(pileupElement.getEventBases().getBytes(),Arrays.copyOfRange(alleleBases,1,alleleBases.length))) // allele contains ref byte, but pileupElement's event bases doesn't
return true; return true;
return false; return false;

View File

@ -26,6 +26,8 @@
package org.broadinstitute.sting.gatk.walkers.genotyper; package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils; import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACcounts;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
@ -123,7 +125,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
* *
* *
*/ */
protected static class SumIterator { public static class SumIterator {
private int[] currentState; private int[] currentState;
private final int[] finalState; private final int[] finalState;
private final int restrictSumTo; private final int restrictSumTo;
@ -491,32 +493,32 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
// If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors // If neighbors fall below maximum - threshold, we don't queue up THEIR own neighbors
// and we repeat until queue is empty // and we repeat until queue is empty
// queue of AC conformations to process // queue of AC conformations to process
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue = new LinkedList<AlleleFrequencyCalculationModel.ExactACset>(); final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
// mapping of ExactACset indexes to the objects // mapping of ExactACset indexes to the objects
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset = new HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset>(likelihoodDim); final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(likelihoodDim);
// add AC=0 to the queue // add AC=0 to the queue
final int[] zeroCounts = new int[nAlleles]; final int[] zeroCounts = new int[nAlleles];
zeroCounts[0] = numChromosomes; zeroCounts[0] = numChromosomes;
AlleleFrequencyCalculationModel.ExactACset zeroSet = ExactACset zeroSet =
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(zeroCounts)); new ExactACset(1, new ExactACcounts(zeroCounts));
ACqueue.add(zeroSet); ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet); indexesToACset.put(zeroSet.getACcounts(), zeroSet);
// keep processing while we have AC conformations that need to be calculated // keep processing while we have AC conformations that need to be calculated
double maxLog10L = Double.NEGATIVE_INFINITY; double maxLog10L = Double.NEGATIVE_INFINITY;
while ( !ACqueue.isEmpty() ) { while ( !ACqueue.isEmpty() ) {
// compute log10Likelihoods // compute log10Likelihoods
final AlleleFrequencyCalculationModel.ExactACset ACset = ACqueue.remove(); final ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup); final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, errorModel, alleleList, numObservations, maxLog10L, ACqueue, indexesToACset, pileup);
// adjust max likelihood seen if needed // adjust max likelihood seen if needed
maxLog10L = Math.max(maxLog10L, log10LofKs); maxLog10L = Math.max(maxLog10L, log10LofKs);
// clean up memory // clean up memory
indexesToACset.remove(ACset.ACcounts); indexesToACset.remove(ACset.getACcounts());
if ( VERBOSE ) if ( VERBOSE )
System.out.printf(" *** removing used set=%s%n", ACset.ACcounts); System.out.printf(" *** removing used set=%s%n", ACset.getACcounts());
} }
@ -525,13 +527,13 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
int plIdx = 0; int plIdx = 0;
SumIterator iterator = new SumIterator(nAlleles, numChromosomes); SumIterator iterator = new SumIterator(nAlleles, numChromosomes);
while (iterator.hasNext()) { while (iterator.hasNext()) {
AlleleFrequencyCalculationModel.ExactACset ACset = ExactACset ACset =
new AlleleFrequencyCalculationModel.ExactACset(1, new AlleleFrequencyCalculationModel.ExactACcounts(iterator.getCurrentVector())); new ExactACset(1, new ExactACcounts(iterator.getCurrentVector()));
// for observed base X, add Q(jX,k) to likelihood vector for all k in error model // for observed base X, add Q(jX,k) to likelihood vector for all k in error model
//likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k)) //likelihood(jA,jC,jG,jT) = logsum(logPr (errorModel[k],nA*Q(jA,k) + nC*Q(jC,k) + nG*Q(jG,k) + nT*Q(jT,k))
getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup); getLikelihoodOfConformation(ACset, errorModel, alleleList, numObservations, pileup);
setLogPLs(plIdx++, ACset.log10Likelihoods[0]); setLogPLs(plIdx++, ACset.getLog10Likelihoods()[0]);
iterator.next(); iterator.next();
} }
} }
@ -540,40 +542,40 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
} }
private double calculateACConformationAndUpdateQueue(final ExactAFCalculationModel.ExactACset set, private double calculateACConformationAndUpdateQueue(final ExactACset set,
final ErrorModel errorModel, final ErrorModel errorModel,
final List<Allele> alleleList, final List<Allele> alleleList,
final List<Integer> numObservations, final List<Integer> numObservations,
final double maxLog10L, final double maxLog10L,
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue, final LinkedList<ExactACset> ACqueue,
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, final HashMap<ExactACcounts,
AlleleFrequencyCalculationModel.ExactACset> indexesToACset, ExactACset> indexesToACset,
final ReadBackedPileup pileup) { final ReadBackedPileup pileup) {
// compute likelihood of set // compute likelihood of set
getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup); getLikelihoodOfConformation(set, errorModel, alleleList, numObservations, pileup);
final double log10LofK = set.log10Likelihoods[0]; final double log10LofK = set.getLog10Likelihoods()[0];
// log result in PL vector // log result in PL vector
int idx = getLinearIndex(set.ACcounts.getCounts(), nAlleles, numChromosomes); int idx = getLinearIndex(set.getACcounts().getCounts(), nAlleles, numChromosomes);
setLogPLs(idx, log10LofK); setLogPLs(idx, log10LofK);
// can we abort early because the log10Likelihoods are so small? // can we abort early because the log10Likelihoods are so small?
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) {
if ( VERBOSE ) if ( VERBOSE )
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L); System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.getACcounts(), log10LofK, maxLog10L);
return log10LofK; return log10LofK;
} }
// iterate over higher frequencies if possible // iterate over higher frequencies if possible
// by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
final int ACwiggle = numChromosomes - set.getACsum() + set.ACcounts.counts[0]; final int ACwiggle = numChromosomes - set.getACsum() + set.getACcounts().getCounts()[0];
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK; return log10LofK;
// add conformations for other cases // add conformations for other cases
for ( int allele = 1; allele < nAlleles; allele++ ) { for ( int allele = 1; allele < nAlleles; allele++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone(); final int[] ACcountsClone = set.getACcounts().getCounts().clone();
ACcountsClone[allele]++; ACcountsClone[allele]++;
// is this a valid conformation? // is this a valid conformation?
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
@ -597,7 +599,7 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
* @param numObservations Number of observations for each allele * @param numObservations Number of observations for each allele
* @param pileup Read backed pileup in case it's necessary * @param pileup Read backed pileup in case it's necessary
*/ */
public abstract void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, public abstract void getLikelihoodOfConformation(final ExactACset ACset,
final ErrorModel errorModel, final ErrorModel errorModel,
final List<Allele> alleleList, final List<Allele> alleleList,
final List<Integer> numObservations, final List<Integer> numObservations,
@ -608,12 +610,12 @@ public abstract class GeneralPloidyGenotypeLikelihoods {
// Static methods // Static methods
public static void updateACset(final int[] newSetCounts, public static void updateACset(final int[] newSetCounts,
final LinkedList<AlleleFrequencyCalculationModel.ExactACset> ACqueue, final LinkedList<ExactACset> ACqueue,
final HashMap<AlleleFrequencyCalculationModel.ExactACcounts, AlleleFrequencyCalculationModel.ExactACset> indexesToACset) { final HashMap<ExactACcounts, ExactACset> indexesToACset) {
final AlleleFrequencyCalculationModel.ExactACcounts index = new AlleleFrequencyCalculationModel.ExactACcounts(newSetCounts); final ExactACcounts index = new ExactACcounts(newSetCounts);
if ( !indexesToACset.containsKey(index) ) { if ( !indexesToACset.containsKey(index) ) {
AlleleFrequencyCalculationModel.ExactACset newSet = new AlleleFrequencyCalculationModel.ExactACset(1, index); ExactACset newSet = new ExactACset(1, index);
indexesToACset.put(index, newSet); indexesToACset.put(index, newSet);
ACqueue.add(newSet); ACqueue.add(newSet);
if (VERBOSE) if (VERBOSE)

View File

@ -41,15 +41,6 @@ import java.util.*;
public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel { public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends GenotypeLikelihoodsCalculationModel {
//protected Set<String> laneIDs;
public enum Model {
SNP,
INDEL,
POOLSNP,
POOLINDEL,
BOTH
}
final protected UnifiedArgumentCollection UAC; final protected UnifiedArgumentCollection UAC;
protected GeneralPloidyGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) { protected GeneralPloidyGenotypeLikelihoodsCalculationModel(UnifiedArgumentCollection UAC, Logger logger) {
@ -203,7 +194,8 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
final AlignmentContextUtils.ReadOrientation contextType, final AlignmentContextUtils.ReadOrientation contextType,
final List<Allele> allAllelesToUse, final List<Allele> allAllelesToUse,
final boolean useBAQedPileup, final boolean useBAQedPileup,
final GenomeLocParser locParser) { final GenomeLocParser locParser,
final Map<String, org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap> perReadAlleleLikelihoodMap) {
HashMap<String, ErrorModel> perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts); HashMap<String, ErrorModel> perLaneErrorModels = getPerLaneErrorModels(tracker, ref, contexts);
if (perLaneErrorModels == null && UAC.referenceSampleName != null) if (perLaneErrorModels == null && UAC.referenceSampleName != null)
@ -215,8 +207,11 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
newContext.put(DUMMY_SAMPLE_NAME,mergedContext); newContext.put(DUMMY_SAMPLE_NAME,mergedContext);
contexts = newContext; contexts = newContext;
} }
if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
// get initial alleles to genotype // starting a new site: clear allele list
perReadAlleleLikelihoodMap.clear(); // clean mapping sample-> per read, per allele likelihoods
}
// get initial alleles to genotype
final List<Allele> allAlleles = new ArrayList<Allele>(); final List<Allele> allAlleles = new ArrayList<Allele>();
if (allAllelesToUse == null || allAllelesToUse.isEmpty()) if (allAllelesToUse == null || allAllelesToUse.isEmpty())
allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse)); allAlleles.addAll(getInitialAllelesToUse(tracker, ref,contexts,contextType,locParser, allAllelesToUse));
@ -234,9 +229,13 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
continue; continue;
ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup(); ReadBackedPileup pileup = AlignmentContextUtils.stratify(sample.getValue(), contextType).getBasePileup();
if (!perReadAlleleLikelihoodMap.containsKey(sample.getKey())){
// no likelihoods have been computed for this sample at this site
perReadAlleleLikelihoodMap.put(sample.getKey(), org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap());
}
// create the GenotypeLikelihoods object // create the GenotypeLikelihoods object
final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO); final GeneralPloidyGenotypeLikelihoods GL = getPoolGenotypeLikelihoodObject(allAlleles, null, UAC.samplePloidy, perLaneErrorModels, useBAQedPileup, ref, UAC.IGNORE_LANE_INFO, perReadAlleleLikelihoodMap.get(sample.getKey()));
// actually compute likelihoods // actually compute likelihoods
final int nGoodBases = GL.add(pileup, UAC); final int nGoodBases = GL.add(pileup, UAC);
if ( nGoodBases > 0 ) if ( nGoodBases > 0 )
@ -246,7 +245,7 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
// find the alternate allele(s) that we should be using // find the alternate allele(s) that we should be using
final List<Allele> alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs); final List<Allele> alleles = getFinalAllelesToUse(tracker, ref, allAllelesToUse, GLs);
if (alleles == null || alleles.isEmpty()) if (alleles == null || alleles.isEmpty() || (alleles.size() == 1 && alleles.get(0).isReference()))
return null; return null;
// start making the VariantContext // start making the VariantContext
final GenomeLoc loc = ref.getLocus(); final GenomeLoc loc = ref.getLocus();
@ -333,7 +332,8 @@ public abstract class GeneralPloidyGenotypeLikelihoodsCalculationModel extends G
final HashMap<String, ErrorModel> perLaneErrorModels, final HashMap<String, ErrorModel> perLaneErrorModels,
final boolean useBQAedPileup, final boolean useBQAedPileup,
final ReferenceContext ref, final ReferenceContext ref,
final boolean ignoreLaneInformation); final boolean ignoreLaneInformation,
final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap);
protected abstract List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker, protected abstract List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
final ReferenceContext ref, final ReferenceContext ref,

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper; package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel;
import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.Haplotype;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
@ -26,6 +27,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
double[][] readHaplotypeLikelihoods; double[][] readHaplotypeLikelihoods;
final byte refBase; final byte refBase;
final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap;
public GeneralPloidyIndelGenotypeLikelihoods(final List<Allele> alleles, public GeneralPloidyIndelGenotypeLikelihoods(final List<Allele> alleles,
final double[] logLikelihoods, final double[] logLikelihoods,
@ -34,7 +36,8 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
final boolean ignoreLaneInformation, final boolean ignoreLaneInformation,
final PairHMMIndelErrorModel pairModel, final PairHMMIndelErrorModel pairModel,
final LinkedHashMap<Allele, Haplotype> haplotypeMap, final LinkedHashMap<Allele, Haplotype> haplotypeMap,
final ReferenceContext referenceContext) { final ReferenceContext referenceContext,
final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap) {
super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation); super(alleles, logLikelihoods, ploidy, perLaneErrorModels, ignoreLaneInformation);
this.pairModel = pairModel; this.pairModel = pairModel;
this.haplotypeMap = haplotypeMap; this.haplotypeMap = haplotypeMap;
@ -42,6 +45,7 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles); this.eventLength = IndelGenotypeLikelihoodsCalculationModel.getEventLength(alleles);
// todo - not needed if indel alleles have base at current position // todo - not needed if indel alleles have base at current position
this.refBase = referenceContext.getBase(); this.refBase = referenceContext.getBase();
this.perReadAlleleLikelihoodMap = perReadAlleleLikelihoodMap;
} }
// ------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------
@ -142,8 +146,9 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
List<Integer> numSeenBases = new ArrayList<Integer>(this.alleles.size()); List<Integer> numSeenBases = new ArrayList<Integer>(this.alleles.size());
if (!hasReferenceSampleData) { if (!hasReferenceSampleData) {
final int readCounts[] = new int[pileup.getNumberOfElements()]; final int readCounts[] = new int[pileup.getNumberOfElements()];
readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap(), readCounts); readHaplotypeLikelihoods = pairModel.computeGeneralReadHaplotypeLikelihoods(pileup, haplotypeMap, refContext, eventLength, perReadAlleleLikelihoodMap, readCounts);
n = readHaplotypeLikelihoods.length; n = readHaplotypeLikelihoods.length;
} else { } else {
Allele refAllele = null; Allele refAllele = null;
@ -184,12 +189,12 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
* @param alleleList List of alleles * @param alleleList List of alleles
* @param numObservations Number of observations for each allele in alleleList * @param numObservations Number of observations for each allele in alleleList
*/ */
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, public void getLikelihoodOfConformation(final ExactACset ACset,
final ErrorModel errorModel, final ErrorModel errorModel,
final List<Allele> alleleList, final List<Allele> alleleList,
final List<Integer> numObservations, final List<Integer> numObservations,
final ReadBackedPileup pileup) { final ReadBackedPileup pileup) {
final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, alleleList.size()); final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), alleleList.size());
double p1 = 0.0; double p1 = 0.0;
if (!hasReferenceSampleData) { if (!hasReferenceSampleData) {
@ -214,6 +219,6 @@ public class GeneralPloidyIndelGenotypeLikelihoods extends GeneralPloidyGenotype
} }
p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec); p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ, maxQ), acVec);
} }
ACset.log10Likelihoods[0] = p1; ACset.getLog10Likelihoods()[0] = p1;
} }
} }

View File

@ -62,7 +62,7 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY, pairModel = new PairHMMIndelErrorModel(UAC.INDEL_GAP_OPEN_PENALTY, UAC.INDEL_GAP_CONTINUATION_PENALTY,
UAC.OUTPUT_DEBUG_INDEL_INFO, !UAC.DONT_DO_BANDED_INDEL_COMPUTATION); UAC.OUTPUT_DEBUG_INDEL_INFO, UAC.pairHMM);
haplotypeMap = new LinkedHashMap<Allele, Haplotype>(); haplotypeMap = new LinkedHashMap<Allele, Haplotype>();
} }
@ -73,8 +73,9 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
final HashMap<String, ErrorModel> perLaneErrorModels, final HashMap<String, ErrorModel> perLaneErrorModels,
final boolean useBQAedPileup, final boolean useBQAedPileup,
final ReferenceContext ref, final ReferenceContext ref,
final boolean ignoreLaneInformation){ final boolean ignoreLaneInformation,
return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref); final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){
return new GeneralPloidyIndelGenotypeLikelihoods(alleles, logLikelihoods, ploidy,perLaneErrorModels,ignoreLaneInformation, pairModel, haplotypeMap, ref, perReadAlleleLikelihoodMap);
} }
protected List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker, protected List<Allele> getInitialAllelesToUse(final RefMetaDataTracker tracker,
@ -90,7 +91,6 @@ public class GeneralPloidyIndelGenotypeLikelihoodsCalculationModel extends Gener
if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE) if (alleles.size() > MAX_NUM_ALLELES_TO_GENOTYPE)
alleles = alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE); alleles = alleles.subList(0,MAX_NUM_ALLELES_TO_GENOTYPE);
if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) { if (contextType == AlignmentContextUtils.ReadOrientation.COMPLETE) {
IndelGenotypeLikelihoodsCalculationModel.getIndelLikelihoodMap().clear();
haplotypeMap.clear(); haplotypeMap.clear();
} }
IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap); IndelGenotypeLikelihoodsCalculationModel.getHaplotypeMapFromAlleles(alleles, ref, ref.getLocus(), haplotypeMap);

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper;
import net.sf.samtools.SAMUtils; import net.sf.samtools.SAMUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.BAQ;
@ -12,7 +13,10 @@ import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl; import org.broadinstitute.sting.utils.pileup.ReadBackedPileupImpl;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.util.*; import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import static java.lang.Math.log10; import static java.lang.Math.log10;
import static java.lang.Math.pow; import static java.lang.Math.pow;
@ -218,12 +222,12 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
* @param alleleList List of alleles * @param alleleList List of alleles
* @param numObservations Number of observations for each allele in alleleList * @param numObservations Number of observations for each allele in alleleList
*/ */
public void getLikelihoodOfConformation(final AlleleFrequencyCalculationModel.ExactACset ACset, public void getLikelihoodOfConformation(final ExactACset ACset,
final ErrorModel errorModel, final ErrorModel errorModel,
final List<Allele> alleleList, final List<Allele> alleleList,
final List<Integer> numObservations, final List<Integer> numObservations,
final ReadBackedPileup pileup) { final ReadBackedPileup pileup) {
final int[] currentCnt = Arrays.copyOf(ACset.ACcounts.counts, BaseUtils.BASES.length); final int[] currentCnt = Arrays.copyOf(ACset.getACcounts().getCounts(), BaseUtils.BASES.length);
final int[] ac = new int[BaseUtils.BASES.length]; final int[] ac = new int[BaseUtils.BASES.length];
for (int k=0; k < BaseUtils.BASES.length; k++ ) for (int k=0; k < BaseUtils.BASES.length; k++ )
@ -238,9 +242,9 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
final byte qual = qualToUse(elt, true, true, mbq); final byte qual = qualToUse(elt, true, true, mbq);
if ( qual == 0 ) if ( qual == 0 )
continue; continue;
final double acc[] = new double[ACset.ACcounts.counts.length]; final double acc[] = new double[ACset.getACcounts().getCounts().length];
for (int k=0; k < acc.length; k++ ) for (int k=0; k < acc.length; k++ )
acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.ACcounts.counts[k]] acc[k] = qualLikelihoodCache[BaseUtils.simpleBaseToBaseIndex(alleleList.get(k).getBases()[0])][BaseUtils.simpleBaseToBaseIndex(obsBase)][qual] +MathUtils.log10Cache[ACset.getACcounts().getCounts()[k]]
- LOG10_PLOIDY; - LOG10_PLOIDY;
p1 += MathUtils.log10sumLog10(acc); p1 += MathUtils.log10sumLog10(acc);
} }
@ -264,7 +268,7 @@ public class GeneralPloidySNPGenotypeLikelihoods extends GeneralPloidyGenotypeLi
p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec); p1 = MathUtils.logDotProduct(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ), acVec);
} }
ACset.log10Likelihoods[0] = p1; ACset.getLog10Likelihoods()[0] = p1;
/* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1)); /* System.out.println(Arrays.toString(ACset.ACcounts.getCounts())+" "+String.valueOf(p1));
System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ))); System.out.println(Arrays.toString(errorModel.getErrorModelVector().getProbabilityVector(minQ,maxQ)));
*/ */

View File

@ -49,7 +49,8 @@ public class GeneralPloidySNPGenotypeLikelihoodsCalculationModel extends General
final HashMap<String, ErrorModel> perLaneErrorModels, final HashMap<String, ErrorModel> perLaneErrorModels,
final boolean useBQAedPileup, final boolean useBQAedPileup,
final ReferenceContext ref, final ReferenceContext ref,
final boolean ignoreLaneInformation) { final boolean ignoreLaneInformation,
final org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap perReadAlleleLikelihoodMap){
return new GeneralPloidySNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO); return new GeneralPloidySNPGenotypeLikelihoods(alleles, null, UAC.samplePloidy, perLaneErrorModels, useBQAedPileup, UAC.IGNORE_LANE_INFO);
} }

View File

@ -0,0 +1,315 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.log4j.ConsoleAppender;
import org.apache.log4j.Logger;
import org.apache.log4j.TTCCLayout;
import org.broadinstitute.sting.gatk.report.GATKReport;
import org.broadinstitute.sting.gatk.report.GATKReportTable;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import java.io.*;
import java.util.*;
/**
 * A simple GATK utility (i.e., run from the command line) for assessing the performance of
 * the exact allele-frequency model.
 */
public class AFCalcPerformanceTest {
    final static Logger logger = Logger.getLogger(AFCalcPerformanceTest.class);

    /**
     * One benchmark analysis.  Subclasses time the exact model along a particular
     * experimental axis and append one row per measurement to a GATKReport table.
     */
    private static abstract class Analysis {
        final GATKReport report;

        public Analysis(final String name, final List<String> columns) {
            report = GATKReport.newSimpleReport(name, columns);
        }

        /**
         * Run the analysis, adding one row per measurement to this object's report.
         *
         * @param testBuilder factory for calculators, priors, and synthetic variant contexts
         * @param coreColumns values shared by every row emitted by this run
         */
        public abstract void run(final AFCalcTestBuilder testBuilder,
                                 final List<Object> coreColumns);

        public String getName() {
            return getTable().getTableName();
        }

        public GATKReportTable getTable() {
            return report.getTables().iterator().next();
        }
    }

    /**
     * Times getLog10PNonRef as a function of the allele-count (AC) conformation of the genotypes.
     */
    private static class AnalyzeByACAndPL extends Analysis {
        public AnalyzeByACAndPL(final List<String> columns) {
            super("AnalyzeByACAndPL", Utils.append(columns, "non.type.pls", "ac", "n.alt.seg", "other.ac"));
        }

        public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
            final SimpleTimer timer = new SimpleTimer();

            for ( final int nonTypePL : Arrays.asList(100) ) {
                final AFCalc calc = testBuilder.makeModel();
                final double[] priors = testBuilder.makePriors();

                for ( int[] ACs : makeACs(testBuilder.numAltAlleles, testBuilder.nSamples*2) ) {
                    final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);

                    timer.start();
                    final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors);
                    final long runtime = timer.getElapsedTimeNano();

                    // summarize the conformation: number of segregating alts and total AC beyond the first alt
                    int otherAC = 0;
                    int nAltSeg = 0;
                    for ( int i = 0; i < ACs.length; i++ ) {
                        nAltSeg += ACs[i] > 0 ? 1 : 0;
                        if ( i > 0 ) otherAC += ACs[i];
                    }

                    final List<Object> columns = new LinkedList<Object>(coreValues);
                    columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, ACs[0], nAltSeg, otherAC));
                    report.addRowList(columns);
                }
            }
        }

        /**
         * Enumerate AC conformations (log10-linearly spaced) for 1 or 2 alt alleles over nChrom chromosomes.
         *
         * @throws IllegalArgumentException if nAltAlleles is greater than 2
         */
        private List<int[]> makeACs(final int nAltAlleles, final int nChrom) {
            if ( nAltAlleles > 2 ) throw new IllegalArgumentException("nAltAlleles must be < 3");

            final List<int[]> ACs = new LinkedList<int[]>();

            final List<Integer> ACsToTry = MathUtils.log10LinearRange(0, nChrom, 0.1); //Arrays.asList(0, 1, 2, 3, 6, 10, 20, 40, 60, 100, 200, 400, 600, 1000, 2000, 4000, 6000, 10000, 100000);

            for ( int i : ACsToTry ) {
                // NOTE(review): strictly-less-than excludes the fully-alt conformation AC == nChrom — presumably intentional; confirm
                if ( i < nChrom ) {
                    if ( nAltAlleles == 1 ) {
                        ACs.add(new int[]{i});
                    } else if ( nAltAlleles == 2 ) {
                        for ( int j : ACsToTry ) {
                            if ( j < nChrom - i )
                                ACs.add(new int[]{i, j});
                        }
                    } else {
                        throw new IllegalStateException("cannot get here");
                    }
                }
            }

            return ACs;
        }
    }

    /**
     * Times getLog10PNonRef as a function of where the single het carrier sits in the genotype list.
     */
    private static class AnalyzeBySingletonPosition extends Analysis {
        public AnalyzeBySingletonPosition(final List<String> columns) {
            super("AnalyzeBySingletonPosition", Utils.append(columns, "non.type.pls", "position.of.singleton"));
        }

        public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
            final SimpleTimer timer = new SimpleTimer();

            for ( final int nonTypePL : Arrays.asList(100) ) {
                final AFCalc calc = testBuilder.makeModel();
                final double[] priors = testBuilder.makePriors();

                // one singleton het on the first alt allele
                final int[] ac = new int[testBuilder.numAltAlleles];
                ac[0] = 1;
                final VariantContext vc = testBuilder.makeACTest(ac, 0, nonTypePL);

                for ( final int position : MathUtils.log10LinearRange(0, vc.getNSamples(), 0.1) ) {
                    final VariantContextBuilder vcb = new VariantContextBuilder(vc);
                    final List<Genotype> genotypes = new ArrayList<Genotype>(vc.getGenotypes());
                    // move the singleton carrier to the requested position by rotation
                    Collections.rotate(genotypes, position);
                    vcb.genotypes(genotypes);

                    timer.start();
                    final AFCalcResult resultTracker = calc.getLog10PNonRef(vcb.make(), priors);
                    final long runtime = timer.getElapsedTimeNano();

                    final List<Object> columns = new LinkedList<Object>(coreValues);
                    columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, position));
                    report.addRowList(columns);
                }
            }
        }
    }

    /**
     * Times getLog10PNonRef as a function of the number of non-informative (all-zero PL) genotypes.
     */
    private static class AnalyzeByNonInformative extends Analysis {
        public AnalyzeByNonInformative(final List<String> columns) {
            super("AnalyzeByNonInformative", Utils.append(columns, "non.type.pls", "n.non.informative"));
        }

        public void run(final AFCalcTestBuilder testBuilder, final List<Object> coreValues) {
            final SimpleTimer timer = new SimpleTimer();

            for ( final int nonTypePL : Arrays.asList(100) ) {
                final AFCalc calc = testBuilder.makeModel();
                final double[] priors = testBuilder.makePriors();

                final int[] ac = new int[testBuilder.numAltAlleles];
                ac[0] = 1;

                for ( int nNonInformative = 0; nNonInformative < testBuilder.nSamples; nNonInformative++ ) {
                    final VariantContext vc = testBuilder.makeACTest(ac, nNonInformative, nonTypePL);

                    timer.start();
                    final AFCalcResult resultTracker = calc.getLog10PNonRef(vc, priors);
                    final long runtime = timer.getElapsedTimeNano();

                    final List<Object> columns = new LinkedList<Object>(coreValues);
                    columns.addAll(Arrays.asList(runtime, resultTracker.getnEvaluations(), nonTypePL, nNonInformative));
                    report.addRowList(columns);
                }
            }
        }
    }

    /**
     * A model to benchmark plus the largest sample counts it can handle for bi- and tri-allelic sites.
     */
    private static class ModelParams {
        final AFCalcFactory.Calculation modelType;
        final int maxBiNSamples, maxTriNSamples;

        private ModelParams(AFCalcFactory.Calculation modelType, int maxBiNSamples, int maxTriNSamples) {
            this.modelType = modelType;
            this.maxBiNSamples = maxBiNSamples;
            this.maxTriNSamples = maxTriNSamples;
        }

        /**
         * @return true if this model can be run with nAltAlleles and nSamples within its size limits
         * @throws IllegalStateException if nAltAlleles is neither 1 nor 2
         */
        public boolean meetsConstraints(final int nAltAlleles, final int nSamples) {
            if ( nAltAlleles == 1 )
                return nSamples <= maxBiNSamples;
            else if ( nAltAlleles == 2 )
                return nSamples <= maxTriNSamples;
            else
                throw new IllegalStateException("Unexpected number of alt alleles " + nAltAlleles);
        }
    }

    /** Top-level command selected by args[0]. */
    public enum Operation {
        ANALYZE,
        SINGLE,
        EXACT_LOG
    }

    public static void main(final String[] args) throws Exception {
        // log straight to the console with a minimal layout
        final TTCCLayout layout = new TTCCLayout();
        layout.setThreadPrinting(false);
        layout.setCategoryPrefixing(false);
        layout.setContextPrinting(false);
        logger.addAppender(new ConsoleAppender(layout));

        final Operation op = Operation.valueOf(args[0]);

        switch ( op ) {
            case ANALYZE: analyze(args); break;
            case SINGLE: profileBig(args); break;
            case EXACT_LOG: exactLog(args); break;
            // was IllegalAccessException (a reflection error type); IllegalArgumentException is the right category
            default: throw new IllegalArgumentException("unknown operation " + op);
        }
    }

    /**
     * Replay calls recorded in an exact-call log and compare the logged results/runtimes
     * against a fresh EXACT_INDEPENDENT computation.
     *
     * args: [1] reference fasta, [2] exact log file, [3..] start positions to replay
     */
    private static void exactLog(final String[] args) throws Exception {
        final File ref = new File(args[1]);
        final File exactLogFile = new File(args[2]);
        final List<Integer> startsToUse = new LinkedList<Integer>();

        for ( int i = 3; i < args.length; i++ )
            startsToUse.add(Integer.valueOf(args[i]));

        final CachingIndexedFastaSequenceFile seq = new CachingIndexedFastaSequenceFile(ref);
        final GenomeLocParser parser = new GenomeLocParser(seq);
        final BufferedReader reader = new BufferedReader(new FileReader(exactLogFile));
        final List<ExactCallLogger.ExactCall> loggedCalls;
        try {
            loggedCalls = ExactCallLogger.readExactLog(reader, startsToUse, parser);
        } finally {
            reader.close(); // previously never closed: file-handle leak
        }

        for ( final ExactCallLogger.ExactCall call : loggedCalls ) {
            final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(call.vc.getNSamples(), 1,
                    AFCalcFactory.Calculation.EXACT_INDEPENDENT,
                    AFCalcTestBuilder.PriorType.human);
            logger.info(call);
            final SimpleTimer timer = new SimpleTimer().start();
            final AFCalcResult result = testBuilder.makeModel().getLog10PNonRef(call.vc, testBuilder.makePriors());
            final long newNanoTime = timer.getElapsedTimeNano();

            if ( call.originalCall.anyPolymorphic(-1) || result.anyPolymorphic(-1) ) {
                logger.info("**** ONE IS POLY");
            }
            logger.info("\t\t getLog10PosteriorOfAFGT0: " + call.originalCall.getLog10PosteriorOfAFGT0() + " vs " + result.getLog10PosteriorOfAFGT0());
            final double speedup = call.runtime / (1.0 * newNanoTime);
            logger.info("\t\t runtime: " + call.runtime + " vs " + newNanoTime + " speedup " + String.format("%.2f", speedup) + "x");
            for ( final Allele a : call.originalCall.getAllelesUsedInGenotyping() ) {
                if ( a.isNonReference() ) {
                    final String warningmeMLE = call.originalCall.getAlleleCountAtMLE(a) != result.getAlleleCountAtMLE(a) ? " DANGER-MLE-DIFFERENT" : "";
                    logger.info("\t\t MLE " + a + ": " + call.originalCall.getAlleleCountAtMLE(a) + " vs " + result.getAlleleCountAtMLE(a) + warningmeMLE);
                    final String warningmePost = call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) == 0 && result.getLog10PosteriorOfAFGt0ForAllele(a) < -10 ? " DANGER-POSTERIORS-DIFFERENT" : "";
                    logger.info("\t\t Posterior " + a + ": " + call.originalCall.getLog10PosteriorOfAFGt0ForAllele(a) + " vs " + result.getLog10PosteriorOfAFGt0ForAllele(a) + warningmePost);
                }
            }
        }
    }

    /**
     * Profile a single large EXACT_INDEPENDENT computation.
     *
     * args: [1] number of samples, [2] AC of the single alt allele
     */
    private static void profileBig(final String[] args) throws Exception {
        final int nSamples = Integer.valueOf(args[1]);
        final int ac = Integer.valueOf(args[2]);

        final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(nSamples, 1,
                AFCalcFactory.Calculation.EXACT_INDEPENDENT,
                AFCalcTestBuilder.PriorType.human);

        final VariantContext vc = testBuilder.makeACTest(new int[]{ac}, 0, 100);
        final SimpleTimer timer = new SimpleTimer().start();
        final AFCalcResult resultTracker = testBuilder.makeModel().getLog10PNonRef(vc, testBuilder.makePriors());
        final long runtime = timer.getElapsedTimeNano();
        logger.info("result " + resultTracker.getLog10PosteriorOfAFGT0());
        logger.info("runtime " + runtime);
    }

    /**
     * Run the full benchmark matrix (models x sample counts x alt-allele counts x priors)
     * and write all analysis tables as a GATKReport to the file named by args[1].
     */
    private static void analyze(final String[] args) throws Exception {
        final List<String> coreColumns = Arrays.asList("iteration", "n.alt.alleles", "n.samples",
                "exact.model", "prior.type", "runtime", "n.evaluations");

        final PrintStream out = new PrintStream(new FileOutputStream(args[1]));
        try {
            final List<ModelParams> modelParams = Arrays.asList(
                    new ModelParams(AFCalcFactory.Calculation.EXACT_REFERENCE, 10000, 10),
                    // new ModelParams(AFCalcTestBuilder.ModelType.GeneralExact, 100, 10),
                    new ModelParams(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 10000, 1000));

            final boolean ONLY_HUMAN_PRIORS = false;
            final List<AFCalcTestBuilder.PriorType> priorTypes = ONLY_HUMAN_PRIORS
                    ? Arrays.asList(AFCalcTestBuilder.PriorType.values())
                    : Arrays.asList(AFCalcTestBuilder.PriorType.human);

            final List<Analysis> analyzes = new ArrayList<Analysis>();
            analyzes.add(new AnalyzeByACAndPL(coreColumns));
            analyzes.add(new AnalyzeBySingletonPosition(coreColumns));
            //analyzes.add(new AnalyzeByNonInformative(coreColumns));

            for ( int iteration = 0; iteration < 1; iteration++ ) {
                for ( final int nAltAlleles : Arrays.asList(1, 2) ) {
                    for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
                        for ( final ModelParams modelToRun : modelParams) {
                            if ( modelToRun.meetsConstraints(nAltAlleles, nSamples) ) {
                                for ( final AFCalcTestBuilder.PriorType priorType : priorTypes ) {
                                    final AFCalcTestBuilder testBuilder
                                            = new AFCalcTestBuilder(nSamples, nAltAlleles, modelToRun.modelType, priorType);

                                    for ( final Analysis analysis : analyzes ) {
                                        logger.info(Utils.join("\t", Arrays.asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType, analysis.getName())));
                                        // typed as List<Object> at creation: avoids the previous unchecked cast
                                        final List<Object> values = Arrays.<Object>asList(iteration, nAltAlleles, nSamples, modelToRun.modelType, priorType);
                                        analysis.run(testBuilder, values);
                                    }
                                }
                            }
                        }
                    }
                }
            }

            final GATKReport report = new GATKReport();
            for ( final Analysis analysis : analyzes )
                report.addTable(analysis.getTable());
            report.print(out);
        } finally {
            out.close(); // previously leaked if any analysis threw
        }
    }
}

View File

@ -0,0 +1,174 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class AFCalcTestBuilder {
final static Allele A = Allele.create("A", true);
final static Allele C = Allele.create("C");
final static Allele G = Allele.create("G");
final static Allele T = Allele.create("T");
final static Allele AA = Allele.create("AA");
final static Allele AT = Allele.create("AT");
final static Allele AG = Allele.create("AG");
static int sampleNameCounter = 0;
final int nSamples;
final int numAltAlleles;
final AFCalcFactory.Calculation modelType;
final PriorType priorType;
public AFCalcTestBuilder(final int nSamples, final int numAltAlleles,
final AFCalcFactory.Calculation modelType, final PriorType priorType) {
this.nSamples = nSamples;
this.numAltAlleles = numAltAlleles;
this.modelType = modelType;
this.priorType = priorType;
}
@Override
public String toString() {
return String.format("AFCalcTestBuilder nSamples=%d nAlts=%d model=%s prior=%s", nSamples, numAltAlleles, modelType, priorType);
}
public enum PriorType {
flat,
human
}
public int getNumAltAlleles() {
return numAltAlleles;
}
public int getnSamples() {
return nSamples;
}
public AFCalc makeModel() {
return AFCalcFactory.createAFCalc(modelType, nSamples, getNumAltAlleles(), 2);
}
public double[] makePriors() {
final int nPriorValues = 2*nSamples+1;
switch ( priorType ) {
case flat:
return MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
case human:
final double[] humanPriors = new double[nPriorValues];
UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);
return humanPriors;
default:
throw new RuntimeException("Unexpected type " + priorType);
}
}
public VariantContext makeACTest(final List<Integer> ACs, final int nNonInformative, final int nonTypePL) {
return makeACTest(ArrayUtils.toPrimitive(ACs.toArray(new Integer[]{})), nNonInformative, nonTypePL);
}
public VariantContext makeACTest(final int[] ACs, final int nNonInformative, final int nonTypePL) {
final int nChrom = nSamples * 2;
final int[] nhet = new int[numAltAlleles];
final int[] nhomvar = new int[numAltAlleles];
for ( int i = 0; i < ACs.length; i++ ) {
final double p = ACs[i] / (1.0 * nChrom);
nhomvar[i] = (int)Math.floor((nSamples - nNonInformative) * p * p);
nhet[i] = ACs[i] - 2 * nhomvar[i];
if ( nhet[i] < 0 )
throw new IllegalStateException("Bug! nhet[i] < 0");
}
final long calcAC = MathUtils.sum(nhet) + 2 * MathUtils.sum(nhomvar);
if ( calcAC != MathUtils.sum(ACs) )
throw new IllegalStateException("calculated AC " + calcAC + " not equal to desired AC " + Utils.join(",", ACs));
return makeACTest(nhet, nhomvar, nNonInformative, nonTypePL);
}
public VariantContext makeACTest(final int[] nhet, final int[] nhomvar, final int nNonInformative, final int nonTypePL) {
List<Genotype> samples = new ArrayList<Genotype>(nSamples);
for ( int altI = 0; altI < nhet.length; altI++ ) {
for ( int i = 0; i < nhet[altI]; i++ )
samples.add(makePL(GenotypeType.HET, nonTypePL, altI+1));
for ( int i = 0; i < nhomvar[altI]; i++ )
samples.add(makePL(GenotypeType.HOM_VAR, nonTypePL, altI+1));
}
final Genotype nonInformative = makeNonInformative();
samples.addAll(Collections.nCopies(nNonInformative, nonInformative));
final int nRef = Math.max((int) (nSamples - nNonInformative - MathUtils.sum(nhet) - MathUtils.sum(nhomvar)), 0);
samples.addAll(Collections.nCopies(nRef, makePL(GenotypeType.HOM_REF, nonTypePL, 0)));
samples = samples.subList(0, nSamples);
if ( samples.size() > nSamples )
throw new IllegalStateException("too many samples");
VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, getAlleles());
vcb.genotypes(samples);
return vcb.make();
}
public List<Allele> getAlleles() {
return Arrays.asList(A, C, G, T, AA, AT, AG).subList(0, numAltAlleles+1);
}
public List<Allele> getAlleles(final GenotypeType type, final int altI) {
switch (type) {
case HOM_REF: return Arrays.asList(getAlleles().get(0), getAlleles().get(0));
case HET: return Arrays.asList(getAlleles().get(0), getAlleles().get(altI));
case HOM_VAR: return Arrays.asList(getAlleles().get(altI), getAlleles().get(altI));
default: throw new IllegalArgumentException("Unexpected type " + type);
}
}
/**
 * Build a genotype with the given alleles and PL vector, under a fresh
 * unique sample name (sampleNameCounter is advanced as a side effect).
 *
 * @param expectedGT the alleles to assign to the genotype
 * @param pls        the PL vector to attach
 * @return the constructed Genotype
 */
public Genotype makePL(final List<Allele> expectedGT, int ... pls) {
    final String sampleName = "sample" + sampleNameCounter++;
    return new GenotypeBuilder(sampleName)
            .alleles(expectedGT)
            .PL(pls)
            .make();
}
// Number of diploid genotype likelihoods for the reference plus numAltAlleles alternates.
private int numPLs() {
    final int nAlleles = numAltAlleles + 1;
    return GenotypeLikelihoods.numLikelihoods(nAlleles, 2);
}
/**
 * A non-informative genotype: no-call alleles with a flat (all-zero) PL vector.
 *
 * Uses numPLs() so the PL array length matches the diploid likelihood count for
 * numAltAlleles + 1 alleles.  The previous code passed numAltAlleles (omitting the
 * reference allele) to numLikelihoods, producing a PL vector of the wrong length,
 * inconsistent with every other PL vector built by this class.
 */
public Genotype makeNonInformative() {
    final int[] nonInformativePLs = new int[numPLs()];
    return makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), nonInformativePLs);
}
/**
 * Build a genotype of the given type whose PL vector is nonTypePL everywhere
 * except a single 0 at the index of the intended genotype, making that genotype
 * the most likely one.  A fresh unique sample name is generated as a side effect.
 *
 * @param type      the genotype kind to favor (HOM_REF, HET, or HOM_VAR)
 * @param nonTypePL PL value assigned to every other genotype
 * @param altI      alternate-allele index used for HET / HOM_VAR genotypes
 * @return the constructed Genotype
 */
public Genotype makePL(final GenotypeType type, final int nonTypePL, final int altI) {
    final GenotypeBuilder builder = new GenotypeBuilder("sample" + sampleNameCounter++);
    builder.alleles(getAlleles(type, altI));

    // Flat PL vector, then zero out the entry for the requested genotype.
    final int[] pls = new int[numPLs()];
    Arrays.fill(pls, nonTypePL);

    final int bestIndex;
    switch ( type ) {
        case HET:
            bestIndex = GenotypeLikelihoods.calculatePLindex(0, altI);
            break;
        case HOM_VAR:
            bestIndex = GenotypeLikelihoods.calculatePLindex(altI, altI);
            break;
        case HOM_REF:
        default:
            // calculatePLindex(0, 0) == 0, matching the original's untouched default.
            bestIndex = GenotypeLikelihoods.calculatePLindex(0, 0);
            break;
    }
    pls[bestIndex] = 0;

    builder.PL(pls);
    return builder.make();
}
}

View File

@ -23,56 +23,52 @@
* THE USE OR OTHER DEALINGS IN THE SOFTWARE. * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ */
package org.broadinstitute.sting.gatk.walkers.genotyper; package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.variantcontext.*; import org.broadinstitute.sting.utils.variantcontext.*;
import java.io.PrintStream;
import java.util.*; import java.util.*;
public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalculationModel { public class GeneralPloidyExactAFCalc extends ExactAFCalc {
static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them static final int MAX_LENGTH_FOR_POOL_PL_LOGGING = 10; // if PL vectors longer than this # of elements, don't log them
final protected UnifiedArgumentCollection UAC;
private final int ploidy; private final int ploidy;
private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6 private final static double MAX_LOG10_ERROR_TO_STOP_EARLY = 6; // we want the calculation to be accurate to 1 / 10^6
private final static boolean VERBOSE = false; private final static boolean VERBOSE = false;
protected GeneralPloidyExactAFCalculationModel(UnifiedArgumentCollection UAC, int N, Logger logger, PrintStream verboseWriter) { protected GeneralPloidyExactAFCalc(final int nSamples, final int maxAltAlleles, final int ploidy) {
super(UAC, N, logger, verboseWriter); super(nSamples, maxAltAlleles, ploidy);
ploidy = UAC.samplePloidy; this.ploidy = ploidy;
this.UAC = UAC;
} }
public List<Allele> getLog10PNonRef(final VariantContext vc, @Override
final double[] log10AlleleFrequencyPriors, protected VariantContext reduceScope(VariantContext vc) {
final AlleleFrequencyCalculationResult result) {
GenotypesContext GLs = vc.getGenotypes();
List<Allele> alleles = vc.getAlleles();
// don't try to genotype too many alternate alleles // don't try to genotype too many alternate alleles
if ( vc.getAlternateAlleles().size() > MAX_ALTERNATE_ALLELES_TO_GENOTYPE ) { if ( vc.getAlternateAlleles().size() > getMaxAltAlleles()) {
logger.warn("this tool is currently set to genotype at most " + MAX_ALTERNATE_ALLELES_TO_GENOTYPE + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument"); logger.warn("this tool is currently set to genotype at most " + getMaxAltAlleles() + " alternate alleles in a given context, but the context at " + vc.getChr() + ":" + vc.getStart() + " has " + (vc.getAlternateAlleles().size()) + " alternate alleles so only the top alleles will be used; see the --max_alternate_alleles argument");
alleles = new ArrayList<Allele>(MAX_ALTERNATE_ALLELES_TO_GENOTYPE + 1); final List<Allele> alleles = new ArrayList<Allele>(getMaxAltAlleles() + 1);
alleles.add(vc.getReference()); alleles.add(vc.getReference());
alleles.addAll(chooseMostLikelyAlternateAlleles(vc, MAX_ALTERNATE_ALLELES_TO_GENOTYPE, ploidy)); alleles.addAll(chooseMostLikelyAlternateAlleles(vc, getMaxAltAlleles(), ploidy));
VariantContextBuilder builder = new VariantContextBuilder(vc);
GLs = subsetAlleles(vc, alleles, false, ploidy); builder.alleles(alleles);
builder.genotypes(subsetAlleles(vc, alleles, false, ploidy));
return builder.make();
} else {
return vc;
} }
combineSinglePools(GLs, alleles.size(), ploidy, log10AlleleFrequencyPriors, result);
return alleles;
} }
@Override
public AFCalcResult computeLog10PNonRef(final VariantContext vc, final double[] log10AlleleFrequencyPriors) {
combineSinglePools(vc.getGenotypes(), vc.getNAlleles(), ploidy, log10AlleleFrequencyPriors);
return getResultFromFinalState(vc, log10AlleleFrequencyPriors);
}
/** /**
* Simple wrapper class to hold values of combined pool likelihoods. * Simple wrapper class to hold values of combined pool likelihoods.
@ -94,8 +90,8 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
public void add(ExactACset set) { public void add(ExactACset set) {
alleleCountSetList.add(set); alleleCountSetList.add(set);
conformationMap.put(set.ACcounts, set); conformationMap.put(set.getACcounts(), set);
final double likelihood = set.log10Likelihoods[0]; final double likelihood = set.getLog10Likelihoods()[0];
if (likelihood > maxLikelihood ) if (likelihood > maxLikelihood )
maxLikelihood = likelihood; maxLikelihood = likelihood;
@ -108,11 +104,11 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
} }
public double getLikelihoodOfConformation(int[] ac) { public double getLikelihoodOfConformation(int[] ac) {
return conformationMap.get(new ExactACcounts(ac)).log10Likelihoods[0]; return conformationMap.get(new ExactACcounts(ac)).getLog10Likelihoods()[0];
} }
public double getGLOfACZero() { public double getGLOfACZero() {
return alleleCountSetList.get(0).log10Likelihoods[0]; // AC 0 is always at beginning of list return alleleCountSetList.get(0).getLog10Likelihoods()[0]; // AC 0 is always at beginning of list
} }
public int getLength() { public int getLength() {
@ -129,6 +125,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
* @return list of numAllelesToChoose most likely alleles * @return list of numAllelesToChoose most likely alleles
*/ */
private static final int PL_INDEX_OF_HOM_REF = 0;
private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) { private static List<Allele> chooseMostLikelyAlternateAlleles(VariantContext vc, int numAllelesToChoose, int ploidy) {
final int numOriginalAltAlleles = vc.getAlternateAlleles().size(); final int numOriginalAltAlleles = vc.getAlternateAlleles().size();
final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles]; final LikelihoodSum[] likelihoodSums = new LikelihoodSum[numOriginalAltAlleles];
@ -136,7 +133,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i)); likelihoodSums[i] = new LikelihoodSum(vc.getAlternateAllele(i));
// based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype // based on the GLs, find the alternate alleles with the most probability; sum the GLs for the most likely genotype
final ArrayList<double[]> GLs = getGLs(vc.getGenotypes()); final ArrayList<double[]> GLs = getGLs(vc.getGenotypes(), false);
for ( final double[] likelihoods : GLs ) { for ( final double[] likelihoods : GLs ) {
final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods); final int PLindexOfBestGL = MathUtils.maxElementIndex(likelihoods);
@ -144,7 +141,7 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
// by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele // by convention, first count coming from getAlleleCountFromPLIndex comes from reference allele
for (int k=1; k < acCount.length;k++) { for (int k=1; k < acCount.length;k++) {
if (acCount[k] > 0) if (acCount[k] > 0)
likelihoodSums[k-1].sum += likelihoods[PLindexOfBestGL]; likelihoodSums[k-1].sum += acCount[k] * (likelihoods[PLindexOfBestGL] - likelihoods[PL_INDEX_OF_HOM_REF]);
} }
} }
@ -171,15 +168,13 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
* @param numAlleles Number of alternate alleles * @param numAlleles Number of alternate alleles
* @param ploidyPerPool Number of samples per pool * @param ploidyPerPool Number of samples per pool
* @param log10AlleleFrequencyPriors Frequency priors * @param log10AlleleFrequencyPriors Frequency priors
* @param result object to fill with output values
*/ */
protected static void combineSinglePools(final GenotypesContext GLs, protected void combineSinglePools(final GenotypesContext GLs,
final int numAlleles, final int numAlleles,
final int ploidyPerPool, final int ploidyPerPool,
final double[] log10AlleleFrequencyPriors, final double[] log10AlleleFrequencyPriors) {
final AlleleFrequencyCalculationResult result) {
final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs); final ArrayList<double[]> genotypeLikelihoods = getGLs(GLs, true);
int combinedPloidy = 0; int combinedPloidy = 0;
@ -190,23 +185,30 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
// first element: zero ploidy, e.g. trivial degenerate distribution // first element: zero ploidy, e.g. trivial degenerate distribution
final int[] zeroCounts = new int[numAlleles]; final int[] zeroCounts = new int[numAlleles];
final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts)); final ExactACset set = new ExactACset(1, new ExactACcounts(zeroCounts));
set.log10Likelihoods[0] = 0.0; set.getLog10Likelihoods()[0] = 0.0;
combinedPoolLikelihoods.add(set); combinedPoolLikelihoods.add(set);
for (int p=1; p<genotypeLikelihoods.size(); p++) {
result.reset(); if ( genotypeLikelihoods.size() <= 1 ) {
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p), combinedPloidy, ploidyPerPool, // no meaningful GLs at all, just set the tracker to non poly values
numAlleles, log10AlleleFrequencyPriors, result); getStateTracker().reset(); // just mimic-ing call below
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods getStateTracker().setLog10LikelihoodOfAFzero(0.0);
} else {
for (int p=1; p<genotypeLikelihoods.size(); p++) {
getStateTracker().reset(); // TODO -- why is this here? It makes it hard to track the n evaluation
combinedPoolLikelihoods = fastCombineMultiallelicPool(combinedPoolLikelihoods, genotypeLikelihoods.get(p),
combinedPloidy, ploidyPerPool, numAlleles, log10AlleleFrequencyPriors);
combinedPloidy = ploidyPerPool + combinedPloidy; // total number of chromosomes in combinedLikelihoods
}
} }
} }
public static CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool, double[] newGL, int originalPloidy, int newGLPloidy, int numAlleles, public CombinedPoolLikelihoods fastCombineMultiallelicPool(final CombinedPoolLikelihoods originalPool,
final double[] log10AlleleFrequencyPriors, double[] newGL,
final AlleleFrequencyCalculationResult result) { int originalPloidy,
int newGLPloidy,
int numAlleles,
final double[] log10AlleleFrequencyPriors) {
final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>(); final LinkedList<ExactACset> ACqueue = new LinkedList<ExactACset>();
// mapping of ExactACset indexes to the objects // mapping of ExactACset indexes to the objects
final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>(); final HashMap<ExactACcounts, ExactACset> indexesToACset = new HashMap<ExactACcounts, ExactACset>();
@ -220,19 +222,19 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts)); ExactACset zeroSet = new ExactACset(1, new ExactACcounts(zeroCounts));
ACqueue.add(zeroSet); ACqueue.add(zeroSet);
indexesToACset.put(zeroSet.ACcounts, zeroSet); indexesToACset.put(zeroSet.getACcounts(), zeroSet);
// keep processing while we have AC conformations that need to be calculated // keep processing while we have AC conformations that need to be calculated
double maxLog10L = Double.NEGATIVE_INFINITY;
while ( !ACqueue.isEmpty() ) { while ( !ACqueue.isEmpty() ) {
getStateTracker().incNEvaluations();
// compute log10Likelihoods // compute log10Likelihoods
final ExactACset ACset = ACqueue.remove(); final ExactACset ACset = ACqueue.remove();
final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, result, maxLog10L, ACqueue, indexesToACset); final double log10LofKs = calculateACConformationAndUpdateQueue(ACset, newPool, originalPool, newGL, log10AlleleFrequencyPriors, originalPloidy, newGLPloidy, ACqueue, indexesToACset);
maxLog10L = Math.max(maxLog10L, log10LofKs);
// clean up memory // clean up memory
indexesToACset.remove(ACset.ACcounts); indexesToACset.remove(ACset.getACcounts());
if ( VERBOSE ) if ( VERBOSE )
System.out.printf(" *** removing used set=%s%n", ACset.ACcounts); System.out.printf(" *** removing used set=%s%n", ACset.getACcounts());
} }
return newPool; return newPool;
@ -248,51 +250,46 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
* @param log10AlleleFrequencyPriors Prior object * @param log10AlleleFrequencyPriors Prior object
* @param originalPloidy Total ploidy of original combined pool * @param originalPloidy Total ploidy of original combined pool
* @param newGLPloidy Ploidy of GL vector * @param newGLPloidy Ploidy of GL vector
* @param result AFResult object
* @param maxLog10L max likelihood observed so far
* @param ACqueue Queue of conformations to compute * @param ACqueue Queue of conformations to compute
* @param indexesToACset AC indices of objects in queue * @param indexesToACset AC indices of objects in queue
* @return max log likelihood * @return max log likelihood
*/ */
private static double calculateACConformationAndUpdateQueue(final ExactACset set, private double calculateACConformationAndUpdateQueue(final ExactACset set,
final CombinedPoolLikelihoods newPool, final CombinedPoolLikelihoods newPool,
final CombinedPoolLikelihoods originalPool, final CombinedPoolLikelihoods originalPool,
final double[] newGL, final double[] newGL,
final double[] log10AlleleFrequencyPriors, final double[] log10AlleleFrequencyPriors,
final int originalPloidy, final int originalPloidy,
final int newGLPloidy, final int newGLPloidy,
final AlleleFrequencyCalculationResult result, final LinkedList<ExactACset> ACqueue,
final double maxLog10L, final HashMap<ExactACcounts, ExactACset> indexesToACset) {
final LinkedList<ExactACset> ACqueue,
final HashMap<ExactACcounts, ExactACset> indexesToACset) {
// compute likeihood in "set" of new set based on original likelihoods // compute likeihood in "set" of new set based on original likelihoods
final int numAlleles = set.ACcounts.counts.length; final int numAlleles = set.getACcounts().getCounts().length;
final int newPloidy = set.getACsum(); final int newPloidy = set.getACsum();
final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy, result); final double log10LofK = computeLofK(set, originalPool, newGL, log10AlleleFrequencyPriors, numAlleles, originalPloidy, newGLPloidy);
// add to new pool // add to new pool
if (!Double.isInfinite(log10LofK)) if (!Double.isInfinite(log10LofK))
newPool.add(set); newPool.add(set);
if ( log10LofK < maxLog10L - MAX_LOG10_ERROR_TO_STOP_EARLY ) { // TODO -- change false to true this correct line when the implementation of this model is optimized (it's too slow now to handle this fix)
if ( VERBOSE ) if ( getStateTracker().abort(log10LofK, set.getACcounts(), false) ) {
System.out.printf(" *** breaking early set=%s log10L=%.2f maxLog10L=%.2f%n", set.ACcounts, log10LofK, maxLog10L);
return log10LofK; return log10LofK;
} }
// iterate over higher frequencies if possible // iterate over higher frequencies if possible
// by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count. // by convention, ACcounts contained in set have full vector of possible pool ac counts including ref count.
// so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space // so, if first element is zero, it automatically means we have no wiggle since we're in a corner of the conformation space
final int ACwiggle = set.ACcounts.counts[0]; final int ACwiggle = set.getACcounts().getCounts()[0];
if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies if ( ACwiggle == 0 ) // all alternate alleles already sum to 2N so we cannot possibly go to higher frequencies
return log10LofK; return log10LofK;
// add conformations for other cases // add conformations for other cases
for ( int allele = 1; allele < numAlleles; allele++ ) { for ( int allele = 1; allele < numAlleles; allele++ ) {
final int[] ACcountsClone = set.ACcounts.getCounts().clone(); final int[] ACcountsClone = set.getACcounts().getCounts().clone();
ACcountsClone[allele]++; ACcountsClone[allele]++;
// is this a valid conformation? // is this a valid conformation?
int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0]; int altSum = (int)MathUtils.sum(ACcountsClone) - ACcountsClone[0];
@ -309,67 +306,67 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
} }
/** // /**
* Naive combiner of two multiallelic pools - number of alt alleles must be the same. // * Naive combiner of two multiallelic pools - number of alt alleles must be the same.
* Math is generalization of biallelic combiner. // * Math is generalization of biallelic combiner.
* // *
* For vector K representing an allele count conformation, // * For vector K representing an allele count conformation,
* Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K) // * Pr(D | AC = K) = Sum_G Pr(D|AC1 = G) Pr (D|AC2=K-G) * F(G,K)
* where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...]) // * where F(G,K) = choose(m1,[g0 g1 ...])*choose(m2,[...]) / choose(m1+m2,[k1 k2 ...])
* @param originalPool First log-likelihood pool GL vector // * @param originalPool First log-likelihood pool GL vector
* @param yy Second pool GL vector // * @param yy Second pool GL vector
* @param ploidy1 Ploidy of first pool (# of chromosomes in it) // * @param ploidy1 Ploidy of first pool (# of chromosomes in it)
* @param ploidy2 Ploidy of second pool // * @param ploidy2 Ploidy of second pool
* @param numAlleles Number of alleles // * @param numAlleles Number of alleles
* @param log10AlleleFrequencyPriors Array of biallelic priors // * @param log10AlleleFrequencyPriors Array of biallelic priors
* @param result Af calculation result object // * @param resultTracker Af calculation result object
*/ // */
public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles, // public static void combineMultiallelicPoolNaively(CombinedPoolLikelihoods originalPool, double[] yy, int ploidy1, int ploidy2, int numAlleles,
final double[] log10AlleleFrequencyPriors, // final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) { // final AFCalcResultTracker resultTracker) {
/* ///*
final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1); // final int dim1 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy1);
final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2); // final int dim2 = GenotypeLikelihoods.numLikelihoods(numAlleles, ploidy2);
//
if (dim1 != originalPool.getLength() || dim2 != yy.length) // if (dim1 != originalPool.getLength() || dim2 != yy.length)
throw new ReviewedStingException("BUG: Inconsistent vector length"); // throw new ReviewedStingException("BUG: Inconsistent vector length");
//
if (ploidy2 == 0) // if (ploidy2 == 0)
return; // return;
//
final int newPloidy = ploidy1 + ploidy2; // final int newPloidy = ploidy1 + ploidy2;
//
// Say L1(K) = Pr(D|AC1=K) * choose(m1,K) // // Say L1(K) = Pr(D|AC1=K) * choose(m1,K)
// and L2(K) = Pr(D|AC2=K) * choose(m2,K) // // and L2(K) = Pr(D|AC2=K) * choose(m2,K)
GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1); // GeneralPloidyGenotypeLikelihoods.SumIterator firstIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy1);
final double[] x = originalPool.getLikelihoodsAsVector(true); // final double[] x = originalPool.getLikelihoodsAsVector(true);
while(firstIterator.hasNext()) { // while(firstIterator.hasNext()) {
x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector()); // x[firstIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy1,firstIterator.getCurrentVector());
firstIterator.next(); // firstIterator.next();
} // }
//
GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); // GeneralPloidyGenotypeLikelihoods.SumIterator secondIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
final double[] y = yy.clone(); // final double[] y = yy.clone();
while(secondIterator.hasNext()) { // while(secondIterator.hasNext()) {
y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector()); // y[secondIterator.getLinearIndex()] += MathUtils.log10MultinomialCoefficient(ploidy2,secondIterator.getCurrentVector());
secondIterator.next(); // secondIterator.next();
} // }
//
// initialize output to -log10(choose(m1+m2,[k1 k2...]) // // initialize output to -log10(choose(m1+m2,[k1 k2...])
final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy); // final int outputDim = GenotypeLikelihoods.numLikelihoods(numAlleles, newPloidy);
final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy); // final GeneralPloidyGenotypeLikelihoods.SumIterator outputIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,newPloidy);
//
//
// Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K // // Now, result(K) = logSum_G (L1(G)+L2(K-G)) where G are all possible vectors that sum UP to K
while(outputIterator.hasNext()) { // while(outputIterator.hasNext()) {
final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector())); // final ExactACset set = new ExactACset(1, new ExactACcounts(outputIterator.getCurrentAltVector()));
double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result); // double likelihood = computeLofK(set, x,y, log10AlleleFrequencyPriors, numAlleles, ploidy1, ploidy2, result);
//
originalPool.add(likelihood, set, outputIterator.getLinearIndex()); // originalPool.add(likelihood, set, outputIterator.getLinearIndex());
outputIterator.next(); // outputIterator.next();
} // }
*/ //*/
} // }
/** /**
* Compute likelihood of a particular AC conformation and update AFresult object * Compute likelihood of a particular AC conformation and update AFresult object
@ -380,15 +377,13 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
* @param numAlleles Number of alleles (including ref) * @param numAlleles Number of alleles (including ref)
* @param ploidy1 Ploidy of original pool (combined) * @param ploidy1 Ploidy of original pool (combined)
* @param ploidy2 Ploidy of new pool * @param ploidy2 Ploidy of new pool
* @param result AFResult object
* @return log-likehood of requested conformation * @return log-likehood of requested conformation
*/ */
private static double computeLofK(final ExactACset set, private double computeLofK(final ExactACset set,
final CombinedPoolLikelihoods firstGLs, final CombinedPoolLikelihoods firstGLs,
final double[] secondGL, final double[] secondGL,
final double[] log10AlleleFrequencyPriors, final double[] log10AlleleFrequencyPriors,
final int numAlleles, final int ploidy1, final int ploidy2, final int numAlleles, final int ploidy1, final int ploidy2) {
final AlleleFrequencyCalculationResult result) {
final int newPloidy = ploidy1 + ploidy2; final int newPloidy = ploidy1 + ploidy2;
@ -397,17 +392,18 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
if (newPloidy != totalAltK) if (newPloidy != totalAltK)
throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values"); throw new ReviewedStingException("BUG: inconsistent sizes of set.getACsum and passed ploidy values");
totalAltK -= set.ACcounts.counts[0]; totalAltK -= set.getACcounts().getCounts()[0];
// totalAltK has sum of alt alleles of conformation now // totalAltK has sum of alt alleles of conformation now
// special case for k = 0 over all k // special case for k = 0 over all k
if ( totalAltK == 0 ) { // all-ref case if ( totalAltK == 0 ) { // all-ref case
final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX]; final double log10Lof0 = firstGLs.getGLOfACZero() + secondGL[HOM_REF_INDEX];
set.log10Likelihoods[0] = log10Lof0; set.getLog10Likelihoods()[0] = log10Lof0;
result.setLog10LikelihoodOfAFzero(log10Lof0); getStateTracker().setLog10LikelihoodOfAFzero(log10Lof0);
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]); getStateTracker().setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
return log10Lof0;
} else { } else {
@ -415,12 +411,12 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
// ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy. // ExactACset holds by convention the conformation of all alleles, and the sum of all allele count is just the ploidy.
// To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i // To compute n!/k1!k2!k3!... we need to compute first n!/(k2!k3!...) and then further divide by k1! where k1=ploidy-sum_k_i
int[] currentCount = set.ACcounts.getCounts(); int[] currentCount = set.getACcounts().getCounts();
double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount); double denom = -MathUtils.log10MultinomialCoefficient(newPloidy, currentCount);
// for current conformation, get all possible ways to break vector K into two components G1 and G2 // for current conformation, get all possible ways to break vector K into two components G1 and G2
final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2); final GeneralPloidyGenotypeLikelihoods.SumIterator innerIterator = new GeneralPloidyGenotypeLikelihoods.SumIterator(numAlleles,ploidy2);
set.log10Likelihoods[0] = Double.NEGATIVE_INFINITY; set.getLog10Likelihoods()[0] = Double.NEGATIVE_INFINITY;
while (innerIterator.hasNext()) { while (innerIterator.hasNext()) {
// check if breaking current conformation into g1 and g2 is feasible. // check if breaking current conformation into g1 and g2 is feasible.
final int[] acCount2 = innerIterator.getCurrentVector(); final int[] acCount2 = innerIterator.getCurrentVector();
@ -436,27 +432,29 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2); final double num2 = MathUtils.log10MultinomialCoefficient(ploidy2, acCount2);
final double sum = firstGL + gl2 + num1 + num2; final double sum = firstGL + gl2 + num1 + num2;
set.log10Likelihoods[0] = MathUtils.approximateLog10SumLog10(set.log10Likelihoods[0], sum); set.getLog10Likelihoods()[0] = MathUtils.approximateLog10SumLog10(set.getLog10Likelihoods()[0], sum);
} }
} }
innerIterator.next(); innerIterator.next();
} }
set.log10Likelihoods[0] += denom; set.getLog10Likelihoods()[0] += denom;
} }
double log10LofK = set.log10Likelihoods[0]; double log10LofK = set.getLog10Likelihoods()[0];
// update the MLE if necessary // update the MLE if necessary
final int altCounts[] = Arrays.copyOfRange(set.ACcounts.counts,1, set.ACcounts.counts.length); final int altCounts[] = Arrays.copyOfRange(set.getACcounts().getCounts(),1, set.getACcounts().getCounts().length);
result.updateMLEifNeeded(log10LofK, altCounts); // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY
getStateTracker().updateMLEifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
// apply the priors over each alternate allele // apply the priors over each alternate allele
for (final int ACcount : altCounts ) { for (final int ACcount : altCounts ) {
if ( ACcount > 0 ) if ( ACcount > 0 )
log10LofK += log10AlleleFrequencyPriors[ACcount]; log10LofK += log10AlleleFrequencyPriors[ACcount];
} }
result.updateMAPifNeeded(log10LofK, altCounts); // TODO -- GUILLERMO THIS CODE MAY PRODUCE POSITIVE LIKELIHOODS OR -INFINITY
getStateTracker().updateMAPifNeeded(Math.max(log10LofK, -Double.MAX_VALUE), altCounts);
return log10LofK; return log10LofK;
} }
@ -479,99 +477,6 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
return (sum == ploidy); return (sum == ploidy);
} }
/**
* Combines naively two biallelic pools (of arbitrary size).
* For two pools of size m1 and m2, we can compute the combined likelihood as:
* Pr(D|AC=k) = Sum_{j=0}^k Pr(D|AC1=j) Pr(D|AC2=k-j) * choose(m1,j)*choose(m2,k-j)/choose(m1+m2,k)
* @param originalPool Pool likelihood vector, x[k] = Pr(AC_i = k) for alt allele i
* @param newPLVector Second GL vector
* @param ploidy1 Ploidy of first pool (# of chromosomes in it)
* @param ploidy2 Ploidy of second pool
* @param log10AlleleFrequencyPriors Array of biallelic priors
* @param result Af calculation result object
* @return Combined likelihood vector
*/
public static ProbabilityVector combineBiallelicPoolsNaively(final ProbabilityVector originalPool, final double[] newPLVector,
final int ploidy1, final int ploidy2, final double[] log10AlleleFrequencyPriors,
final AlleleFrequencyCalculationResult result) {
final int newPloidy = ploidy1 + ploidy2;
final double[] combinedLikelihoods = new double[1+newPloidy];
/** Pre-fill result array and incorporate weights into input vectors
* Say L1(k) = Pr(D|AC1=k) * choose(m1,k)
* and L2(k) = Pr(D|AC2=k) * choose(m2,k)
* equation reduces to
* Pr(D|AC=k) = 1/choose(m1+m2,k) * Sum_{j=0}^k L1(k) L2(k-j)
* which is just plain convolution of L1 and L2 (with pre-existing vector)
*/
// intialize result vector to -infinity
Arrays.fill(combinedLikelihoods,Double.NEGATIVE_INFINITY);
final double[] x = Arrays.copyOf(originalPool.getProbabilityVector(),1+ploidy1);
for (int k=originalPool.getProbabilityVector().length; k< x.length; k++)
x[k] = Double.NEGATIVE_INFINITY;
final double[] y = newPLVector.clone();
final double log10Lof0 = x[0]+y[0];
result.setLog10LikelihoodOfAFzero(log10Lof0);
result.setLog10PosteriorOfAFzero(log10Lof0 + log10AlleleFrequencyPriors[0]);
double maxElement = log10Lof0;
int maxElementIdx = 0;
int[] alleleCounts = new int[1];
for (int k= originalPool.getMinVal() ; k <= newPloidy; k++) {
double[] acc = new double[k+1];
Arrays.fill(acc,Double.NEGATIVE_INFINITY);
double innerMax = Double.NEGATIVE_INFINITY;
for (int j=0; j <=k; j++) {
double x1,y1;
if (k-j>=0 && k-j < y.length)
y1 = y[k-j] + MathUtils.log10BinomialCoefficient(ploidy2,k-j);
else
continue;
if (j < x.length)
x1 = x[j] + MathUtils.log10BinomialCoefficient(ploidy1,j);
else
continue;
if (Double.isInfinite(x1) || Double.isInfinite(y1))
continue;
acc[j] = x1 + y1;
if (acc[j] > innerMax)
innerMax = acc[j];
else if (acc[j] < innerMax - MAX_LOG10_ERROR_TO_STOP_EARLY)
break;
}
combinedLikelihoods[k] = MathUtils.log10sumLog10(acc) - MathUtils.log10BinomialCoefficient(newPloidy,k);
maxElementIdx = k;
double maxDiff = combinedLikelihoods[k] - maxElement;
if (maxDiff > 0)
maxElement = combinedLikelihoods[k];
else if (maxDiff < maxElement - MAX_LOG10_ERROR_TO_STOP_EARLY) {
break;
}
alleleCounts[0] = k;
result.updateMLEifNeeded(combinedLikelihoods[k],alleleCounts);
result.updateMAPifNeeded(combinedLikelihoods[k] + log10AlleleFrequencyPriors[k],alleleCounts);
}
return new ProbabilityVector(MathUtils.normalizeFromLog10(Arrays.copyOf(combinedLikelihoods,maxElementIdx+1),false, true));
}
/** /**
* From a given variant context, extract a given subset of alleles, and update genotype context accordingly, * From a given variant context, extract a given subset of alleles, and update genotype context accordingly,
* including updating the PL's, and assign genotypes accordingly * including updating the PL's, and assign genotypes accordingly
@ -614,7 +519,10 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
// create the new likelihoods array from the alleles we are allowed to use // create the new likelihoods array from the alleles we are allowed to use
final double[] originalLikelihoods = g.getLikelihoods().getAsVector(); final double[] originalLikelihoods = g.getLikelihoods().getAsVector();
double[] newLikelihoods; double[] newLikelihoods;
if ( numOriginalAltAlleles == numNewAltAlleles) {
// Optimization: if # of new alt alleles = 0 (pure ref call), keep original likelihoods so we skip normalization
// and subsetting
if ( numOriginalAltAlleles == numNewAltAlleles || numNewAltAlleles == 0) {
newLikelihoods = originalLikelihoods; newLikelihoods = originalLikelihoods;
} else { } else {
newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse); newLikelihoods = GeneralPloidyGenotypeLikelihoods.subsetToAlleles(originalLikelihoods, ploidy, vc.getAlleles(), allelesToUse);
@ -657,10 +565,10 @@ public class GeneralPloidyExactAFCalculationModel extends AlleleFrequencyCalcula
* *
* @return genotype * @return genotype
*/ */
private static void assignGenotype(final GenotypeBuilder gb, private void assignGenotype(final GenotypeBuilder gb,
final double[] newLikelihoods, final double[] newLikelihoods,
final List<Allele> allelesToUse, final List<Allele> allelesToUse,
final int numChromosomes) { final int numChromosomes) {
final int numNewAltAlleles = allelesToUse.size() - 1; final int numNewAltAlleles = allelesToUse.size() - 1;

View File

@ -2,6 +2,9 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import org.jgrapht.graph.DefaultDirectedGraph; import org.jgrapht.graph.DefaultDirectedGraph;
import java.io.Serializable;
import java.util.Comparator;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
* User: ebanks * User: ebanks
@ -9,7 +12,7 @@ import org.jgrapht.graph.DefaultDirectedGraph;
*/ */
// simple edge class for connecting nodes in the graph // simple edge class for connecting nodes in the graph
public class DeBruijnEdge implements Comparable<DeBruijnEdge> { public class DeBruijnEdge {
private int multiplicity; private int multiplicity;
private boolean isRef; private boolean isRef;
@ -53,8 +56,10 @@ public class DeBruijnEdge implements Comparable<DeBruijnEdge> {
return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge))); return (graph.getEdgeSource(this).equals(graph2.getEdgeSource(edge))) && (graph.getEdgeTarget(this).equals(graph2.getEdgeTarget(edge)));
} }
@Override public static class EdgeWeightComparator implements Comparator<DeBruijnEdge>, Serializable {
public int compareTo( final DeBruijnEdge that ) { @Override
return this.multiplicity - that.multiplicity; public int compare(final DeBruijnEdge edge1, final DeBruijnEdge edge2) {
return edge1.multiplicity - edge2.multiplicity;
}
} }
} }

View File

@ -14,7 +14,7 @@ public class DeBruijnVertex {
public final int kmer; public final int kmer;
public DeBruijnVertex( final byte[] sequence, final int kmer ) { public DeBruijnVertex( final byte[] sequence, final int kmer ) {
this.sequence = sequence; this.sequence = sequence.clone();
this.kmer = kmer; this.kmer = kmer;
} }
@ -37,7 +37,7 @@ public class DeBruijnVertex {
} }
public byte[] getSequence() { public byte[] getSequence() {
return sequence; return sequence.clone();
} }
public byte[] getSuffix() { public byte[] getSuffix() {

View File

@ -52,7 +52,11 @@ public class GenotypingEngine {
noCall.add(Allele.NO_CALL); noCall.add(Allele.NO_CALL);
} }
// This function is the streamlined approach, currently not being used // WARN
// This function is the streamlined approach, currently not being used by default
// WARN
// WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code.
// WARN
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine,
final ArrayList<Haplotype> haplotypes, final ArrayList<Haplotype> haplotypes,
@ -184,6 +188,7 @@ public class GenotypingEngine {
return returnCalls; return returnCalls;
} }
// BUGBUG: Create a class to hold this complicated return type
@Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"})
public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, public List<Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine,
final ArrayList<Haplotype> haplotypes, final ArrayList<Haplotype> haplotypes,
@ -210,14 +215,9 @@ public class GenotypingEngine {
System.out.println( ">> Events = " + h.getEventMap()); System.out.println( ">> Events = " + h.getEventMap());
} }
} }
// Create the VC merge priority list
final ArrayList<String> priorityList = new ArrayList<String>();
for( int iii = 0; iii < haplotypes.size(); iii++ ) {
priorityList.add("HC" + iii);
}
cleanUpSymbolicUnassembledEvents( haplotypes, priorityList ); cleanUpSymbolicUnassembledEvents( haplotypes );
if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 3 ) { // if not in GGA mode and have at least 3 samples try to create MNP and complex events by looking at LD structure if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure
mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc ); mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc );
} }
if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode! if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode!
@ -229,13 +229,16 @@ public class GenotypingEngine {
// Walk along each position in the key set and create each event to be outputted // Walk along each position in the key set and create each event to be outputted
for( final int loc : startPosKeySet ) { for( final int loc : startPosKeySet ) {
if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) {
final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); final ArrayList<VariantContext> eventsAtThisLoc = new ArrayList<VariantContext>(); // the overlapping events to merge into a common reference view
final ArrayList<String> priorityList = new ArrayList<String>(); // used to merge overlapping events into common reference view
if( activeAllelesToGenotype.isEmpty() ) { if( activeAllelesToGenotype.isEmpty() ) {
for( final Haplotype h : haplotypes ) { for( final Haplotype h : haplotypes ) {
final HashMap<Integer,VariantContext> eventMap = h.getEventMap(); final HashMap<Integer,VariantContext> eventMap = h.getEventMap();
final VariantContext vc = eventMap.get(loc); final VariantContext vc = eventMap.get(loc);
if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) {
eventsAtThisLoc.add(vc); eventsAtThisLoc.add(vc);
priorityList.add(vc.getSource());
} }
} }
} else { // we are in GGA mode! } else { // we are in GGA mode!
@ -260,11 +263,27 @@ public class GenotypingEngine {
// Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event
final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); final ArrayList<ArrayList<Haplotype>> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes );
// Sanity check the priority list
for( final VariantContext vc : eventsAtThisLoc ) {
if( !priorityList.contains(vc.getSource()) ) {
throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles.");
}
}
for( final String name : priorityList ) {
boolean found = false;
for( final VariantContext vc : eventsAtThisLoc ) {
if(vc.getSource().equals(name)) { found = true; break; }
}
if( !found ) {
throw new ReviewedStingException("Event added to priority list but wasn't found on any haplotype. Something went wrong in the merging of alleles.");
}
}
// Merge the event to find a common reference representation // Merge the event to find a common reference representation
final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false);
if( mergedVC == null ) { continue; } if( mergedVC == null ) { continue; }
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>(); HashMap<Allele, ArrayList<Haplotype>> alleleHashMap = new HashMap<Allele, ArrayList<Haplotype>>();
int aCount = 0; int aCount = 0;
for( final Allele a : mergedVC.getAlleles() ) { for( final Allele a : mergedVC.getAlleles() ) {
alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper
@ -289,9 +308,20 @@ public class GenotypingEngine {
} }
genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() ); genotypes.add( new GenotypeBuilder(sample).alleles(noCall).PL(genotypeLikelihoods).make() );
} }
final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); VariantContext call = UG_engine.calculateGenotypes(new VariantContextBuilder(mergedVC).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel);
if( call != null ) { if( call != null ) {
if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary!
final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call);
// also, need to update the allele -> haplotype mapping
final HashMap<Allele, ArrayList<Haplotype>> alleleHashMapTrim = new HashMap<Allele, ArrayList<Haplotype>>();
for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC
alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii)));
}
call = vcCallTrim;
alleleHashMap = alleleHashMapTrim;
}
returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) ); returnCalls.add( new Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>>(call, alleleHashMap) );
} }
} }
@ -299,9 +329,8 @@ public class GenotypingEngine {
return returnCalls; return returnCalls;
} }
protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes, final ArrayList<String> priorityList ) { protected static void cleanUpSymbolicUnassembledEvents( final ArrayList<Haplotype> haplotypes ) {
final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>(); final ArrayList<Haplotype> haplotypesToRemove = new ArrayList<Haplotype>();
final ArrayList<String> stringsToRemove = new ArrayList<String>();
for( final Haplotype h : haplotypes ) { for( final Haplotype h : haplotypes ) {
for( final VariantContext vc : h.getEventMap().values() ) { for( final VariantContext vc : h.getEventMap().values() ) {
if( vc.isSymbolic() ) { if( vc.isSymbolic() ) {
@ -309,7 +338,6 @@ public class GenotypingEngine {
for( final VariantContext vc2 : h2.getEventMap().values() ) { for( final VariantContext vc2 : h2.getEventMap().values() ) {
if( vc.getStart() == vc2.getStart() && vc2.isIndel() ) { if( vc.getStart() == vc2.getStart() && vc2.isIndel() ) {
haplotypesToRemove.add(h); haplotypesToRemove.add(h);
stringsToRemove.add(vc.getSource());
break; break;
} }
} }
@ -318,7 +346,6 @@ public class GenotypingEngine {
} }
} }
haplotypes.removeAll(haplotypesToRemove); haplotypes.removeAll(haplotypesToRemove);
priorityList.removeAll(stringsToRemove);
} }
protected void mergeConsecutiveEventsBasedOnLD( final ArrayList<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { protected void mergeConsecutiveEventsBasedOnLD( final ArrayList<Haplotype> haplotypes, final TreeSet<Integer> startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) {

View File

@ -27,29 +27,23 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures; import com.google.java.contract.Ensures;
import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.IndexedFastaSequenceFile;
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection; import org.broadinstitute.sting.gatk.arguments.DbsnpArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.filters.BadMateFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.PartitionBy;
import org.broadinstitute.sting.gatk.walkers.PartitionType;
import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine;
import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.*;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult;
import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.clipping.ReadClipper;
import org.broadinstitute.sting.utils.codecs.vcf.*; import org.broadinstitute.sting.utils.codecs.vcf.*;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
@ -57,6 +51,9 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile;
import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentCollection;
import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.fragments.FragmentUtils;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.pairhmm.PairHMM;
import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.PileupElement;
import org.broadinstitute.sting.utils.sam.AlignmentUtils; import org.broadinstitute.sting.utils.sam.AlignmentUtils;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
@ -106,6 +103,7 @@ import java.util.*;
@DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) @DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} )
@PartitionBy(PartitionType.LOCUS) @PartitionBy(PartitionType.LOCUS)
@BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN)
@ActiveRegionExtension(extension=65, maxRegion=300) @ActiveRegionExtension(extension=65, maxRegion=300)
public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implements AnnotatorCompatible { public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implements AnnotatorCompatible {
@ -118,6 +116,12 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
@Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false) @Output(fullName="graphOutput", shortName="graph", doc="File to which debug assembly graph information should be written", required = false)
protected PrintStream graphWriter = null; protected PrintStream graphWriter = null;
/**
* The PairHMM implementation to use for genotype likelihood calculations. The various implementations balance a tradeoff of accuracy and runtime.
*/
@Argument(fullName = "pair_hmm_implementation", shortName = "pairHMM", doc = "The PairHMM implementation to use for genotype likelihood calculations", required = false)
public PairHMM.HMM_IMPLEMENTATION pairHMM = PairHMM.HMM_IMPLEMENTATION.LOGLESS_CACHING;
@Hidden @Hidden
@Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false) @Argument(fullName="keepRG", shortName="keepRG", doc="Only use read from this read group when making calls (but use all reads to build the assembly)", required = false)
protected String keepRG = null; protected String keepRG = null;
@ -177,7 +181,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
* so annotations will be excluded even if they are explicitly included with the other options. * so annotations will be excluded even if they are explicitly included with the other options.
*/ */
@Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false) @Argument(fullName="excludeAnnotation", shortName="XA", doc="One or more specific annotations to exclude", required=false)
protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"HaplotypeScore", "MappingQualityZero", "SpanningDeletions", "TandemRepeatAnnotator"})); protected List<String> annotationsToExclude = new ArrayList<String>(Arrays.asList(new String[]{"SpanningDeletions", "TandemRepeatAnnotator"}));
/** /**
* Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups. * Which groups of annotations to add to the output VCF file. See the VariantAnnotator -list argument to view available groups.
@ -238,12 +242,16 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
samplesList.addAll( samples ); samplesList.addAll( samples );
// initialize the UnifiedGenotyper Engine which is used to call into the exact model // initialize the UnifiedGenotyper Engine which is used to call into the exact model
final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user final UnifiedArgumentCollection UAC = new UnifiedArgumentCollection( SCAC ); // this adapter is used so that the full set of unused UG arguments aren't exposed to the HC user
UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC.clone(), logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); UG_engine = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
UAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
UAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested
UAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING); UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC);
UAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), UAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling
simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING );
simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING );
simpleUAC.exactCallsLog = null;
UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY);
// initialize the output VCF header // initialize the output VCF header
annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit()); annotationEngine = new VariantAnnotatorEngine(Arrays.asList(annotationClassesToUse), annotationsToUse, annotationsToExclude, this, getToolkit());
@ -287,7 +295,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
} }
assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter );
likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, false ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM );
genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE );
} }
@ -312,7 +320,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) {
for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) { for( final VariantContext vc : tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()) ) {
if( !allelesToGenotype.contains(vc) ) { if( !allelesToGenotype.contains(vc) ) {
allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a ReadMetaDataTracker object allelesToGenotype.add(vc); // save for later for processing during the ActiveRegion's map call. Should be folded into a RefMetaDataTracker object
} }
} }
if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) { if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) {
@ -400,6 +408,9 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria final List<GATKSAMRecord> filteredReads = filterNonPassingReads( activeRegion ); // filter out reads from genotyping which fail mapping quality based criteria
if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do! if( activeRegion.size() == 0 ) { return 1; } // no reads remain after filtering so nothing else to do!
// sort haplotypes to take full advantage of haplotype start offset optimizations in PairHMM
Collections.sort( haplotypes, new Haplotype.HaplotypeBaseComparator() );
// evaluate each sample's reads against all haplotypes // evaluate each sample's reads against all haplotypes
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() ); final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList = splitReadsBySample( activeRegion.getReads() );
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads ); final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList = splitReadsBySample( filteredReads );
@ -414,7 +425,7 @@ public class HaplotypeCaller extends ActiveRegionWalker<Integer, Integer> implem
: genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) { : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) {
if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); }
final Map<String, Map<Allele, List<GATKSAMRecord>>> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult ); final Map<String, PerReadAlleleLikelihoodMap> stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog );
final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst());
final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes()); final Map<String, Object> myAttributes = new LinkedHashMap<String, Object>(annotatedCall.getAttributes());

View File

@ -4,6 +4,7 @@ import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.jgrapht.graph.DefaultDirectedGraph; import org.jgrapht.graph.DefaultDirectedGraph;
import java.io.Serializable;
import java.util.*; import java.util.*;
/** /**
@ -76,13 +77,15 @@ public class KBestPaths {
} }
} }
protected static class PathComparatorTotalScore implements Comparator<Path> { protected static class PathComparatorTotalScore implements Comparator<Path>, Serializable {
@Override
public int compare(final Path path1, final Path path2) { public int compare(final Path path1, final Path path2) {
return path1.totalScore - path2.totalScore; return path1.totalScore - path2.totalScore;
} }
} }
//protected static class PathComparatorLowestEdge implements Comparator<Path> { //protected static class PathComparatorLowestEdge implements Comparator<Path>, Serializable {
// @Override
// public int compare(final Path path1, final Path path2) { // public int compare(final Path path1, final Path path2) {
// return path2.lowestEdge - path1.lowestEdge; // return path2.lowestEdge - path1.lowestEdge;
// } // }
@ -124,7 +127,7 @@ public class KBestPaths {
// recursively run DFS // recursively run DFS
final ArrayList<DeBruijnEdge> edgeArrayList = new ArrayList<DeBruijnEdge>(); final ArrayList<DeBruijnEdge> edgeArrayList = new ArrayList<DeBruijnEdge>();
edgeArrayList.addAll(graph.outgoingEdgesOf(path.lastVertex)); edgeArrayList.addAll(graph.outgoingEdgesOf(path.lastVertex));
Collections.sort(edgeArrayList); Collections.sort(edgeArrayList, new DeBruijnEdge.EdgeWeightComparator());
Collections.reverse(edgeArrayList); Collections.reverse(edgeArrayList);
for ( final DeBruijnEdge edge : edgeArrayList ) { for ( final DeBruijnEdge edge : edgeArrayList ) {
// make sure the edge is not already in the path // make sure the edge is not already in the path

View File

@ -27,25 +27,46 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller;
import com.google.java.contract.Ensures; import com.google.java.contract.Ensures;
import com.google.java.contract.Requires; import com.google.java.contract.Requires;
import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.pairhmm.*;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.sam.ReadUtils;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.PrintStream;
import java.util.*; import java.util.*;
public class LikelihoodCalculationEngine { public class LikelihoodCalculationEngine {
private static final double LOG_ONE_HALF = -Math.log10(2.0); private static final double LOG_ONE_HALF = -Math.log10(2.0);
private static final double BEST_LIKELIHOOD_THRESHOLD = 0.1;
private final byte constantGCP; private final byte constantGCP;
private final boolean DEBUG; private final boolean DEBUG;
private final PairHMM pairHMM; private final PairHMM pairHMM;
public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final boolean noBanded ) { public LikelihoodCalculationEngine( final byte constantGCP, final boolean debug, final PairHMM.HMM_IMPLEMENTATION hmmType ) {
pairHMM = new PairHMM( noBanded );
switch (hmmType) {
case EXACT:
pairHMM = new ExactPairHMM();
break;
case ORIGINAL:
pairHMM = new OriginalPairHMM();
break;
case CACHING:
pairHMM = new CachingPairHMM();
break;
case LOGLESS_CACHING:
pairHMM = new LoglessCachingPairHMM();
break;
default:
throw new UserException.BadArgumentValue("pairHMM", "Specified pairHMM implementation is unrecognized or incompatible with the HaplotypeCaller. Acceptable options are ORIGINAL, EXACT, CACHING, and LOGLESS_CACHING.");
}
this.constantGCP = constantGCP; this.constantGCP = constantGCP;
DEBUG = debug; DEBUG = debug;
} }
@ -69,23 +90,18 @@ public class LikelihoodCalculationEngine {
X_METRIC_LENGTH += 2; X_METRIC_LENGTH += 2;
Y_METRIC_LENGTH += 2; Y_METRIC_LENGTH += 2;
// initial arrays to hold the probabilities of being in the match, insertion and deletion cases // initialize arrays to hold the probabilities of being in the match, insertion and deletion cases
final double[][] matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH]; pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH);
final double[][] XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
final double[][] YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
PairHMM.initializeArrays(matchMetricArray, XMetricArray, YMetricArray, X_METRIC_LENGTH);
// for each sample's reads // for each sample's reads
for( final String sample : perSampleReadList.keySet() ) { for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sampleEntry : perSampleReadList.entrySet() ) {
//if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); }
// evaluate the likelihood of the reads given those haplotypes // evaluate the likelihood of the reads given those haplotypes
computeReadLikelihoods( haplotypes, perSampleReadList.get(sample), sample, matchMetricArray, XMetricArray, YMetricArray ); computeReadLikelihoods( haplotypes, sampleEntry.getValue(), sampleEntry.getKey() );
} }
} }
private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample, private void computeReadLikelihoods( final ArrayList<Haplotype> haplotypes, final ArrayList<GATKSAMRecord> reads, final String sample ) {
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
final int numHaplotypes = haplotypes.size(); final int numHaplotypes = haplotypes.size();
final int numReads = reads.size(); final int numReads = reads.size();
@ -113,9 +129,8 @@ public class LikelihoodCalculationEngine {
final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) ); final int haplotypeStart = ( previousHaplotypeSeen == null ? 0 : computeFirstDifferingPosition(haplotype.getBases(), previousHaplotypeSeen.getBases()) );
previousHaplotypeSeen = haplotype; previousHaplotypeSeen = haplotype;
readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotype(haplotype.getBases(), read.getReadBases(), readLikelihoods[jjj][iii] = pairHMM.computeReadLikelihoodGivenHaplotypeLog10(haplotype.getBases(), read.getReadBases(),
readQuals, readInsQuals, readDelQuals, overallGCP, readQuals, readInsQuals, readDelQuals, overallGCP, haplotypeStart, jjj == 0);
haplotypeStart, matchMetricArray, XMetricArray, YMetricArray);
readCounts[jjj][iii] = readCount; readCounts[jjj][iii] = readCount;
} }
} }
@ -125,12 +140,12 @@ public class LikelihoodCalculationEngine {
} }
private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) { private static int computeFirstDifferingPosition( final byte[] b1, final byte[] b2 ) {
for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ){ for( int iii = 0; iii < b1.length && iii < b2.length; iii++ ) {
if( b1[iii] != b2[iii] ) { if( b1[iii] != b2[iii] ) {
return iii; return iii;
} }
} }
return b1.length; return Math.min(b1.length, b2.length);
} }
@Requires({"haplotypes.size() > 0"}) @Requires({"haplotypes.size() > 0"})
@ -183,7 +198,7 @@ public class LikelihoodCalculationEngine {
haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF );
} }
} }
haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // MathUtils.approximateLog10SumLog10(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); // BUGBUG: max or sum? haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood);
} }
} }
} }
@ -280,7 +295,7 @@ public class LikelihoodCalculationEngine {
final int numHaplotypes = haplotypes.size(); final int numHaplotypes = haplotypes.size();
final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples final Set<String> sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples
final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>(); final ArrayList<Integer> bestHaplotypesIndexList = new ArrayList<Integer>();
bestHaplotypesIndexList.add(0); // always start with the reference haplotype bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype
// set up the default 1-to-1 haplotype mapping object // set up the default 1-to-1 haplotype mapping object
final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>(); final ArrayList<ArrayList<Haplotype>> haplotypeMapping = new ArrayList<ArrayList<Haplotype>>();
for( final Haplotype h : haplotypes ) { for( final Haplotype h : haplotypes ) {
@ -322,19 +337,30 @@ public class LikelihoodCalculationEngine {
return bestHaplotypes; return bestHaplotypes;
} }
public static Map<String, Map<Allele, List<GATKSAMRecord>>> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList, final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList, final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call) { public static int findReferenceIndex( final List<Haplotype> haplotypes ) {
final Map<String, Map<Allele, List<GATKSAMRecord>>> returnMap = new HashMap<String, Map<Allele, List<GATKSAMRecord>>>(); for( final Haplotype h : haplotypes ) {
if( h.isReference() ) { return haplotypes.indexOf(h); }
}
throw new ReviewedStingException( "No reference haplotype found in the list of haplotypes!" );
}
public static Map<String, PerReadAlleleLikelihoodMap> partitionReadsBasedOnLikelihoods( final GenomeLocParser parser,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleReadList,
final HashMap<String, ArrayList<GATKSAMRecord>> perSampleFilteredReadList,
final Pair<VariantContext, HashMap<Allele,ArrayList<Haplotype>>> call,
final double downsamplingFraction,
final PrintStream downsamplingLog ) {
final Map<String, PerReadAlleleLikelihoodMap> returnMap = new HashMap<String, PerReadAlleleLikelihoodMap>();
final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst()); final GenomeLoc callLoc = parser.createGenomeLoc(call.getFirst());
for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) { for( final Map.Entry<String, ArrayList<GATKSAMRecord>> sample : perSampleReadList.entrySet() ) {
final Map<Allele, List<GATKSAMRecord>> alleleReadMap = new HashMap<Allele, List<GATKSAMRecord>>(); final PerReadAlleleLikelihoodMap likelihoodMap = PerReadAlleleLikelihoodMap.getBestAvailablePerReadAlleleLikelihoodMap();
final ArrayList<GATKSAMRecord> readsForThisSample = sample.getValue(); final ArrayList<GATKSAMRecord> readsForThisSample = sample.getValue();
for( int iii = 0; iii < readsForThisSample.size(); iii++ ) { for( int iii = 0; iii < readsForThisSample.size(); iii++ ) {
final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same! final GATKSAMRecord read = readsForThisSample.get(iii); // BUGBUG: assumes read order in this list and haplotype likelihood list are the same!
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all // only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
final double likelihoods[] = new double[call.getFirst().getAlleles().size()]; for( final Allele a : call.getFirst().getAlleles() ) {
int count = 0;
for( final Allele a : call.getFirst().getAlleles() ) { // find the allele with the highest haplotype likelihood
double maxLikelihood = Double.NEGATIVE_INFINITY; double maxLikelihood = Double.NEGATIVE_INFINITY;
for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object) for( final Haplotype h : call.getSecond().get(a) ) { // use the max likelihood from all the haplotypes which mapped to this allele (achieved via the haplotype mapper object)
final double likelihood = h.getReadLikelihoods(sample.getKey())[iii]; final double likelihood = h.getReadLikelihoods(sample.getKey())[iii];
@ -342,43 +368,26 @@ public class LikelihoodCalculationEngine {
maxLikelihood = likelihood; maxLikelihood = likelihood;
} }
} }
likelihoods[count++] = maxLikelihood; likelihoodMap.add(read, a, maxLikelihood);
} }
final int bestAllele = MathUtils.maxElementIndex(likelihoods);
final double bestLikelihood = likelihoods[bestAllele];
Allele allele = Allele.NO_CALL;
boolean isInformativeRead = false;
for( final double likelihood : likelihoods ) {
if( bestLikelihood - likelihood > BEST_LIKELIHOOD_THRESHOLD ) {
isInformativeRead = true;
break;
}
}
// uninformative reads get the no call Allele
if( isInformativeRead ) {
allele = call.getFirst().getAlleles().get(bestAllele);
}
List<GATKSAMRecord> readList = alleleReadMap.get(allele);
if( readList == null ) {
readList = new ArrayList<GATKSAMRecord>();
alleleReadMap.put(allele, readList);
}
readList.add(read);
} }
} }
// down-sample before adding filtered reads
likelihoodMap.performPerAlleleDownsampling(downsamplingFraction, downsamplingLog);
// add all filtered reads to the NO_CALL list because they weren't given any likelihoods // add all filtered reads to the NO_CALL list because they weren't given any likelihoods
List<GATKSAMRecord> readList = alleleReadMap.get(Allele.NO_CALL);
if( readList == null ) {
readList = new ArrayList<GATKSAMRecord>();
alleleReadMap.put(Allele.NO_CALL, readList);
}
for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) { for( final GATKSAMRecord read : perSampleFilteredReadList.get(sample.getKey()) ) {
// only count the read if it overlaps the event, otherwise it is not added to the output read list at all // only count the read if it overlaps the event, otherwise it is not added to the output read list at all
if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) { if( callLoc.overlapsP(parser.createGenomeLoc(read)) ) {
readList.add(read); for( final Allele a : call.getFirst().getAlleles() ) {
likelihoodMap.add(read, a, 0.0);
}
} }
} }
returnMap.put(sample.getKey(), alleleReadMap);
returnMap.put(sample.getKey(), likelihoodMap);
} }
return returnMap; return returnMap;
} }

View File

@ -184,7 +184,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
for( final GATKSAMRecord read : reads ) { for( final GATKSAMRecord read : reads ) {
final byte[] sequence = read.getReadBases(); final byte[] sequence = read.getReadBases();
final byte[] qualities = read.getBaseQualities(); final byte[] qualities = read.getBaseQualities();
final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not readuced final byte[] reducedReadCounts = read.getReducedReadCounts(); // will be null if read is not reduced
if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) { if( sequence.length > KMER_LENGTH + KMER_OVERLAP ) {
final int kmersInSequence = sequence.length - KMER_LENGTH + 1; final int kmersInSequence = sequence.length - KMER_LENGTH + 1;
for( int iii = 0; iii < kmersInSequence - 1; iii++ ) { for( int iii = 0; iii < kmersInSequence - 1; iii++ ) {
@ -201,7 +201,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine {
// compute mean number of reduced read counts in current kmer span // compute mean number of reduced read counts in current kmer span
final byte[] counts = Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1); final byte[] counts = Arrays.copyOfRange(reducedReadCounts,iii,iii+KMER_LENGTH+1);
// precise rounding can make a difference with low consensus counts // precise rounding can make a difference with low consensus counts
countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length); countNumber = MathUtils.arrayMax(counts);
// countNumber = (int)Math.round((double)MathUtils.sum(counts)/counts.length);
} }
if( !badKmer ) { if( !badKmer ) {

View File

@ -0,0 +1,71 @@
/*
* Copyright (c) 2011 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.genotyper;
import org.broadinstitute.sting.gatk.downsampling.AlleleBiasedDownsamplingUtils;
import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import java.io.PrintStream;
import java.util.*;
public class AdvancedPerReadAlleleLikelihoodMap extends StandardPerReadAlleleLikelihoodMap implements ProtectedPackageSource {

    /**
     * Builds a pileup that has been downsampled on a per-allele basis.
     *
     * Delegates directly to {@link AlleleBiasedDownsamplingUtils#createAlleleBiasedBasePileup}.
     *
     * @param pileup               the pileup to downsample
     * @param downsamplingFraction fraction of reads to remove
     * @param log                  stream to log removed reads to (may be null per the utility's contract -- TODO confirm)
     * @return the downsampled pileup
     */
    public ReadBackedPileup createPerAlleleDownsampledBasePileup(final ReadBackedPileup pileup, final double downsamplingFraction, final PrintStream log) {
        return AlleleBiasedDownsamplingUtils.createAlleleBiasedBasePileup(pileup, downsamplingFraction, log);
    }

    /**
     * Removes a fraction of the reads in this likelihood map, chosen in an allele-aware way.
     *
     * Reduced reads are never removed.  A fraction <= 0 is a no-op; a fraction >= 1 clears the map.
     *
     * @param downsamplingFraction fraction of reads to remove
     * @param log                  stream to log removed reads to
     */
    public void performPerAlleleDownsampling(final double downsamplingFraction, final PrintStream log) {
        // nothing to remove
        if ( downsamplingFraction <= 0.0 )
            return;

        // remove everything
        if ( downsamplingFraction >= 1.0 ) {
            likelihoodReadMap.clear();
            return;
        }

        // stratify the non-reduced reads by the allele each one most strongly supports
        final Map<Allele, List<GATKSAMRecord>> readsByBestAllele = new HashMap<Allele, List<GATKSAMRecord>>(alleles.size());
        for ( final Allele allele : alleles ) {
            readsByBestAllele.put(allele, new ArrayList<GATKSAMRecord>());
        }

        for ( final Map.Entry<GATKSAMRecord, Map<Allele, Double>> entry : likelihoodReadMap.entrySet() ) {
            final GATKSAMRecord read = entry.getKey();
            // do not remove reduced reads!
            if ( read.isReducedRead() )
                continue;
            final Allele bestAllele = getMostLikelyAllele(entry.getValue());
            if ( bestAllele != Allele.NO_CALL )
                readsByBestAllele.get(bestAllele).add(read);
        }

        // let the utility decide which reads to drop, then drop them from the map
        for ( final GATKSAMRecord doomedRead : AlleleBiasedDownsamplingUtils.selectAlleleBiasedReads(readsByBestAllele, downsamplingFraction, log) ) {
            likelihoodReadMap.remove(doomedRead);
        }
    }
}

View File

@ -0,0 +1,181 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays;
/**
 * Caching implementation of the pair HMM used to score reads against haplotypes in log10 space.
 *
 * Relative to OriginalPairHMM, two caches are precomputed so the inner dynamic-programming loop
 * is reduced to table lookups plus log10-sum updates:
 *   - constantMatrix: for each read row, the six quality-derived transition terms
 *     (rebuilt once per read in initializeConstants)
 *   - distanceMatrix: for each (read base, haplotype base) cell, the log10 emission prior
 *     (match probability vs. base-error probability, built in initializeDistanceMatrix)
 *
 * Created with IntelliJ IDEA.
 * User: rpoplin, carneiro
 * Date: 10/16/12
 */
public class CachingPairHMM extends OriginalPairHMM {

    // Per-read cache: row i+2 holds the six transition constants for read position i.
    // Sources (see initializeConstants): [0] from combined ins+del GOP, [1] from GCP,
    // [2] ins-GOP error, [3] GCP error, [4] del-GOP error, [5] GCP error.
    // Consumers (see updateCell): [0],[1] feed the match array; [2],[3] feed X; [4],[5] feed Y.
    double[][] constantMatrix = null; // The cache in the CachingPairHMM
    // Per-cell cache of log10 emission priors for the current read x haplotype pair.
    double[][] distanceMatrix = null; // The cache in the CachingPairHMM

    // Transition constants for row 1 (the boundary row before the first read base),
    // derived from the default gap open (GOP) and gap continuation (GCP) penalties.
    protected static final double [] firstRowConstantMatrix = {
            QualityUtils.qualToProbLog10((byte) (DEFAULT_GOP + DEFAULT_GOP)),
            QualityUtils.qualToProbLog10(DEFAULT_GCP),
            QualityUtils.qualToErrorProbLog10(DEFAULT_GOP),
            QualityUtils.qualToErrorProbLog10(DEFAULT_GCP),
            0.0,
            0.0
    };

    /**
     * Allocates the caches and pre-fills the boundary row of the state matrices.
     *
     * @param READ_MAX_LENGTH      maximum read length that will be evaluated
     * @param HAPLOTYPE_MAX_LENGTH maximum haplotype length that will be evaluated
     */
    @Override
    public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
        super.initialize(READ_MAX_LENGTH, HAPLOTYPE_MAX_LENGTH);

        // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
        final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
        final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;

        constantMatrix = new double[X_METRIC_LENGTH][6];
        distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];

        // fill in the first row once; it depends only on the default penalties, never on the read
        for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
            updateCell(1, jjj, 0.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
        }
    }

    /**
     * Computes log10 Pr(read | haplotype) via the forward algorithm over the cached matrices.
     *
     * @param haplotypeBases    bases of the haplotype being scored against
     * @param readBases         bases of the read
     * @param readQuals         per-base qualities of the read
     * @param insertionGOP      per-base insertion gap-open penalties
     * @param deletionGOP       per-base deletion gap-open penalties
     * @param overallGCP        per-base gap-continuation penalties
     * @param hapStartIndex     first haplotype position that differs from the previously scored
     *                          haplotype; earlier columns are reused from the prior run
     * @param recacheReadValues true when the read changed since the last call, forcing the
     *                          per-read transition constants to be rebuilt
     * @return log10 likelihood of the read given the haplotype
     */
    @Override
    public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
                                                            final byte[] readBases,
                                                            final byte[] readQuals,
                                                            final byte[] insertionGOP,
                                                            final byte[] deletionGOP,
                                                            final byte[] overallGCP,
                                                            final int hapStartIndex,
                                                            final boolean recacheReadValues ) {
        if( recacheReadValues ) {
            initializeConstants( insertionGOP, deletionGOP, overallGCP );
        }
        initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );

        // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
        final int X_METRIC_LENGTH = readBases.length + 2;
        final int Y_METRIC_LENGTH = haplotypeBases.length + 2;

        // forward pass; columns before hapStartIndex+1 keep their values from the previous haplotype
        for (int i = 2; i < X_METRIC_LENGTH; i++) {
            for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
                updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
            }
        }

        // final probability is the log10 sum of the last element in all three state arrays
        final int endI = X_METRIC_LENGTH - 1;
        final int endJ = Y_METRIC_LENGTH - 1;
        return MathUtils.approximateLog10SumLog10(matchMetricArray[endI][endJ], XMetricArray[endI][endJ], YMetricArray[endI][endJ]);
    }

    /**
     * Initializes the matrix that holds all the constants related to the editing
     * distance between the read and the haplotype.
     *
     * @param haplotypeBases the bases of the haplotype
     * @param readBases      the bases of the read
     * @param readQuals      the base quality scores of the read
     * @param startIndex     where to start updating the distanceMatrix (in case this read is similar to the previous read)
     */
    public void initializeDistanceMatrix( final byte[] haplotypeBases,
                                          final byte[] readBases,
                                          final byte[] readQuals,
                                          final int startIndex ) {

        // initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
        // Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
        for (int i = 0; i < readBases.length; i++) {
            final byte x = readBases[i];
            final byte qual = readQuals[i];
            for (int j = startIndex; j < haplotypeBases.length; j++) {
                final byte y = haplotypeBases[j];
                // 'N' in either sequence is treated as a match
                distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
                        QualityUtils.qualToProbLog10(qual) : QualityUtils.qualToErrorProbLog10(qual) );
            }
        }
    }

    /**
     * Initializes the matrix that holds all the constants related to quality scores.
     *
     * @param insertionGOP insertion quality scores of the read
     * @param deletionGOP  deletion quality scores of the read
     * @param overallGCP   overall gap continuation penalty
     */
    public void initializeConstants( final byte[] insertionGOP,
                                     final byte[] deletionGOP,
                                     final byte[] overallGCP ) {
        final int l = insertionGOP.length;
        // row 1 is the boundary row; it shares the static default-penalty array (never mutated)
        constantMatrix[1] = firstRowConstantMatrix;
        for (int i = 0; i < l; i++) {
            // clamp the combined gap-open qual so the byte cast cannot wrap negative
            final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
            constantMatrix[i+2][0] = QualityUtils.qualToProbLog10((byte) qualIndexGOP);
            constantMatrix[i+2][1] = QualityUtils.qualToProbLog10(overallGCP[i]);
            constantMatrix[i+2][2] = QualityUtils.qualToErrorProbLog10(insertionGOP[i]);
            constantMatrix[i+2][3] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
            constantMatrix[i+2][4] = QualityUtils.qualToErrorProbLog10(deletionGOP[i]);
            constantMatrix[i+2][5] = QualityUtils.qualToErrorProbLog10(overallGCP[i]);
        }
        // last row: deletions cost nothing (log10(1) = 0) so the alignment is non-global on the haplotype
        constantMatrix[l+1][4] = 0.0;
        constantMatrix[l+1][5] = 0.0;
    }

    /**
     * Updates a cell in the HMM matrix
     *
     * The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
     * initial conditions
     * @param indI             row index in the matrices to update
     * @param indJ             column index in the matrices to update
     * @param prior            the likelihood editing distance matrix for the read x haplotype
     * @param constants        an array with the six constants relevant to this location
     * @param matchMetricArray the matches likelihood matrix
     * @param XMetricArray     the insertions likelihood matrix
     * @param YMetricArray     the deletions likelihood matrix
     */
    private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
                             final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
        matchMetricArray[indI][indJ] = prior +
                MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ - 1] + constants[0],
                        XMetricArray[indI - 1][indJ - 1] + constants[1],
                        YMetricArray[indI - 1][indJ - 1] + constants[1] );
        XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI - 1][indJ] + constants[2],
                XMetricArray[indI - 1][indJ] + constants[3]);
        YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10( matchMetricArray[indI][indJ - 1] + constants[4],
                YMetricArray[indI][indJ - 1] + constants[5]);
    }
}

View File

@ -0,0 +1,187 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.utils.pairhmm;
import org.broadinstitute.sting.utils.QualityUtils;
import java.util.Arrays;
/**
* Created with IntelliJ IDEA.
* User: rpoplin, carneiro
* Date: 10/16/12
*/
public class LoglessCachingPairHMM extends CachingPairHMM {
protected static final double SCALE_FACTOR_LOG10 = 300.0;
protected static final double [] firstRowConstantMatrix = {
QualityUtils.qualToProb((byte) (DEFAULT_GOP + DEFAULT_GOP)),
QualityUtils.qualToProb(DEFAULT_GCP),
QualityUtils.qualToErrorProb(DEFAULT_GOP),
QualityUtils.qualToErrorProb(DEFAULT_GCP),
1.0,
1.0
};
    /**
     * Allocates the state matrices and caches for the logless HMM.
     *
     * Unlike the parent classes, the matrices here hold scaled linear-space probabilities
     * (not log10 values): the initial condition is 1.0 scaled up by 10^SCALE_FACTOR_LOG10
     * to keep intermediate products away from double underflow; the scale factor is divided
     * back out in computeReadLikelihoodGivenHaplotypeLog10.
     *
     * @param READ_MAX_LENGTH      maximum read length that will be evaluated
     * @param HAPLOTYPE_MAX_LENGTH maximum haplotype length that will be evaluated
     */
    @Override
    public void initialize( final int READ_MAX_LENGTH, final int HAPLOTYPE_MAX_LENGTH ) {
        // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
        final int X_METRIC_LENGTH = READ_MAX_LENGTH + 2;
        final int Y_METRIC_LENGTH = HAPLOTYPE_MAX_LENGTH + 2;

        matchMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
        XMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];
        YMetricArray = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];

        // defensive zeroing; freshly allocated double arrays are already 0.0 in Java
        for( int iii=0; iii < X_METRIC_LENGTH; iii++ ) {
            Arrays.fill(matchMetricArray[iii], 0.0);
            Arrays.fill(XMetricArray[iii], 0.0);
            Arrays.fill(YMetricArray[iii], 0.0);
        }

        // the initial condition: probability 1.0 pre-multiplied by the scale factor
        matchMetricArray[1][1] = Math.pow(10.0, SCALE_FACTOR_LOG10); // Math.log10(1.0);

        constantMatrix = new double[X_METRIC_LENGTH][6];
        distanceMatrix = new double[X_METRIC_LENGTH][Y_METRIC_LENGTH];

        // fill in the first row (boundary row driven only by the default penalties)
        for( int jjj = 2; jjj < Y_METRIC_LENGTH; jjj++ ) {
            updateCell(1, jjj, 1.0, firstRowConstantMatrix, matchMetricArray, XMetricArray, YMetricArray);
        }
    }
    /**
     * Computes log10 Pr(read | haplotype) via the forward algorithm.
     *
     * Despite the "Log10" in the name, the internal arithmetic runs in scaled linear
     * probability space (see initialize); only the final return value is converted back
     * to log10, with the scale factor subtracted off.
     *
     * @param haplotypeBases    bases of the haplotype being scored against
     * @param readBases         bases of the read
     * @param readQuals         per-base qualities of the read
     * @param insertionGOP      per-base insertion gap-open penalties
     * @param deletionGOP       per-base deletion gap-open penalties
     * @param overallGCP        per-base gap-continuation penalties
     * @param hapStartIndex     first haplotype position that differs from the previously scored
     *                          haplotype; earlier columns are reused from the prior run
     * @param recacheReadValues true when the read changed since the last call, forcing the
     *                          per-read transition constants to be rebuilt
     * @return log10 likelihood of the read given the haplotype
     */
    @Override
    public double computeReadLikelihoodGivenHaplotypeLog10( final byte[] haplotypeBases,
                                                            final byte[] readBases,
                                                            final byte[] readQuals,
                                                            final byte[] insertionGOP,
                                                            final byte[] deletionGOP,
                                                            final byte[] overallGCP,
                                                            final int hapStartIndex,
                                                            final boolean recacheReadValues ) {
        if( recacheReadValues ) {
            initializeConstants( insertionGOP, deletionGOP, overallGCP );
        }
        initializeDistanceMatrix( haplotypeBases, readBases, readQuals, hapStartIndex );

        // M, X, and Y arrays are of size read and haplotype + 1 because of an extra column for initial conditions and + 1 to consider the final base in a non-global alignment
        final int X_METRIC_LENGTH = readBases.length + 2;
        final int Y_METRIC_LENGTH = haplotypeBases.length + 2;

        // forward pass; columns before hapStartIndex+1 keep their values from the previous haplotype
        for (int i = 2; i < X_METRIC_LENGTH; i++) {
            for (int j = hapStartIndex+1; j < Y_METRIC_LENGTH; j++) {
                updateCell(i, j, distanceMatrix[i][j], constantMatrix[i], matchMetricArray, XMetricArray, YMetricArray);
            }
        }

        // final probability is the log10 sum of the last element in all three state arrays
        final int endI = X_METRIC_LENGTH - 1;
        final int endJ = Y_METRIC_LENGTH - 1;
        return Math.log10( matchMetricArray[endI][endJ] + XMetricArray[endI][endJ] + YMetricArray[endI][endJ] ) - SCALE_FACTOR_LOG10;
    }
/**
* Initializes the matrix that holds all the constants related to the editing
* distance between the read and the haplotype.
*
* @param haplotypeBases the bases of the haplotype
* @param readBases the bases of the read
* @param readQuals the base quality scores of the read
* @param startIndex where to start updating the distanceMatrix (in case this read is similar to the previous read)
*/
public void initializeDistanceMatrix( final byte[] haplotypeBases,
final byte[] readBases,
final byte[] readQuals,
final int startIndex ) {
// initialize the pBaseReadLog10 matrix for all combinations of read x haplotype bases
// Abusing the fact that java initializes arrays with 0.0, so no need to fill in rows and columns below 2.
for (int i = 0; i < readBases.length; i++) {
final byte x = readBases[i];
final byte qual = readQuals[i];
for (int j = startIndex; j < haplotypeBases.length; j++) {
final byte y = haplotypeBases[j];
distanceMatrix[i+2][j+2] = ( x == y || x == (byte) 'N' || y == (byte) 'N' ?
QualityUtils.qualToProb(qual) : QualityUtils.qualToErrorProb(qual) );
}
}
}
/**
* Initializes the matrix that holds all the constants related to quality scores.
*
* @param insertionGOP insertion quality scores of the read
* @param deletionGOP deletion quality scores of the read
* @param overallGCP overall gap continuation penalty
*/
public void initializeConstants( final byte[] insertionGOP,
final byte[] deletionGOP,
final byte[] overallGCP ) {
final int l = insertionGOP.length;
constantMatrix[1] = firstRowConstantMatrix;
for (int i = 0; i < l; i++) {
final int qualIndexGOP = Math.min(insertionGOP[i] + deletionGOP[i], Byte.MAX_VALUE);
constantMatrix[i+2][0] = QualityUtils.qualToProb((byte) qualIndexGOP);
constantMatrix[i+2][1] = QualityUtils.qualToProb(overallGCP[i]);
constantMatrix[i+2][2] = QualityUtils.qualToErrorProb(insertionGOP[i]);
constantMatrix[i+2][3] = QualityUtils.qualToErrorProb(overallGCP[i]);
constantMatrix[i+2][4] = QualityUtils.qualToErrorProb(deletionGOP[i]);
constantMatrix[i+2][5] = QualityUtils.qualToErrorProb(overallGCP[i]);
}
constantMatrix[l+1][4] = 1.0;
constantMatrix[l+1][5] = 1.0;
}
/**
* Updates a cell in the HMM matrix
*
* The read and haplotype indices are offset by one because the state arrays have an extra column to hold the
* initial conditions
* @param indI row index in the matrices to update
* @param indJ column index in the matrices to update
* @param prior the likelihood editing distance matrix for the read x haplotype
* @param constants an array with the six constants relevant to this location
* @param matchMetricArray the matches likelihood matrix
* @param XMetricArray the insertions likelihood matrix
* @param YMetricArray the deletions likelihood matrix
*/
private void updateCell( final int indI, final int indJ, final double prior, final double[] constants,
final double[][] matchMetricArray, final double[][] XMetricArray, final double[][] YMetricArray ) {
matchMetricArray[indI][indJ] = prior * ( matchMetricArray[indI - 1][indJ - 1] * constants[0] +
XMetricArray[indI - 1][indJ - 1] * constants[1] +
YMetricArray[indI - 1][indJ - 1] * constants[1] );
XMetricArray[indI][indJ] = matchMetricArray[indI - 1][indJ] * constants[2] + XMetricArray[indI - 1][indJ] * constants[3];
YMetricArray[indI][indJ] = matchMetricArray[indI][indJ - 1] * constants[4] + YMetricArray[indI][indJ - 1] * constants[5];
}
}

View File

@ -5,7 +5,9 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
import org.testng.annotations.DataProvider; import org.testng.annotations.DataProvider;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List;
/** /**
* @author ebanks * @author ebanks
@ -34,7 +36,6 @@ public class BQSRIntegrationTest extends WalkerTest {
" -I " + bam + " -I " + bam +
" -L " + interval + " -L " + interval +
args + args +
" --no_plots" +
" -knownSites " + (reference.equals(b36KGReference) ? b36dbSNP129 : hg18dbSNP132) + " -knownSites " + (reference.equals(b36KGReference) ? b36dbSNP129 : hg18dbSNP132) +
" -o %s"; " -o %s";
} }
@ -50,21 +51,21 @@ public class BQSRIntegrationTest extends WalkerTest {
String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam";
String HiSeqInterval = "chr1:10,000,000-10,100,000"; String HiSeqInterval = "chr1:10,000,000-10,100,000";
return new Object[][]{ return new Object[][]{
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "1cfc73371abb933ca26496745d105ff0")}, {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "387b41dc2221a1a4a782958944662b25")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "ee5142776008741b1b2453b1258c6d99")}, {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "b5e26902e76abbd59f94f65c70d18165")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "fbc520794f0f98d52159de956f7217f1")}, {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "a8a9c3f83269911cb61c5fe8fb98dc4a")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab5b93794049c514bf8e407019d76b67")}, {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f43a0473101c63ae93444c300d843e81")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "81df636e3d0ed6f16113517e0169bc96")}, {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "9e05e63339d4716584bfc717cab6bd0f")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "ad3c47355448f8c45e172c6e1129c65d")}, {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "1cf9b9c9c64617dc0f3d2f203f918dbe")},
{new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "fef7240140a9b6d6335ce009fa4edec5")}, {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "aa1949a77bc3066fee551a217c970c0d")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "600652ee49b9ce1ca2d8ee2d8b7c8211")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "f70d8b5358bc2f76696f14b7a807ede0")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "769f95b9dcc78a405d3e6b191e5a19f5")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4c0f63e06830681560a1e9f9aad9fe98")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "43fcba51264cc98bd8466d21e1b96766")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "be2812cd3dae3c326cf35ae3f1c8ad9e")},
{new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "48aaf9ac54b97eac3663882a59354ab2")}, {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "03c29a0c1d21f72b12daf51cec111599")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "dac04b9e1e1c52af8d3a50c2e550fda9")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "7080b2cad02ec6e67ebc766b2dccebf8")},
{new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "90d70542076715a8605a8d4002614b34")}, {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "30e76055c16843b6e33e5b9bd8ced57c")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "600652ee49b9ce1ca2d8ee2d8b7c8211")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "f70d8b5358bc2f76696f14b7a807ede0")},
{new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "26a04f5a28c40750c603cbe8a926d7bd")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "5e657fd6a44dcdc7674b6e5a2de5dc83")},
}; };
} }
@ -74,12 +75,6 @@ public class BQSRIntegrationTest extends WalkerTest {
params.getCommandLine(), params.getCommandLine(),
Arrays.asList(params.md5)); Arrays.asList(params.md5));
executeTest("testBQSR-"+params.args, spec).getFirst(); executeTest("testBQSR-"+params.args, spec).getFirst();
// TODO -- re-enable once parallelization is fixed in BaseRecalibrator
//WalkerTestSpec specNT2 = new WalkerTestSpec(
// params.getCommandLine() + " -nt 2",
// Arrays.asList(params.md5));
//executeTest("testBQSR-nt2-"+params.args, specNT2).getFirst();
} }
@Test @Test
@ -89,7 +84,6 @@ public class BQSRIntegrationTest extends WalkerTest {
" -R " + b36KGReference + " -R " + b36KGReference +
" -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" + " -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" +
" -L 1:10,000,000-10,200,000" + " -L 1:10,000,000-10,200,000" +
" --no_plots" +
" -o %s", " -o %s",
1, // just one output file 1, // just one output file
UserException.CommandLineException.class); UserException.CommandLineException.class);
@ -103,7 +97,6 @@ public class BQSRIntegrationTest extends WalkerTest {
" -R " + b36KGReference + " -R " + b36KGReference +
" -I " + privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam" + " -I " + privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam" +
" -L 1:50,000-80,000" + " -L 1:50,000-80,000" +
" --no_plots" +
" -o %s", " -o %s",
1, // just one output file 1, // just one output file
UserException.class); UserException.class);
@ -127,21 +120,27 @@ public class BQSRIntegrationTest extends WalkerTest {
@DataProvider(name = "PRTest") @DataProvider(name = "PRTest")
public Object[][] createPRTestData() { public Object[][] createPRTestData() {
return new Object[][]{ List<Object[]> tests = new ArrayList<Object[]>();
{new PRTest("", "d2d6ed8667cdba7e56f5db97d6262676")},
{new PRTest(" -qq -1", "b7053d3d67aba6d8892f0a60f0ded338")}, tests.add(new Object[]{1, new PRTest(" -qq -1", "5226c06237b213b9e9b25a32ed92d09a")});
{new PRTest(" -qq 6", "bfbf0855185b2b70aa35237fb71e4487")}, tests.add(new Object[]{1, new PRTest(" -qq 6", "b592a5c62b952a012e18adb898ea9c33")});
{new PRTest(" -DIQ", "66aa65223f192ee39c1773aa187fd493")} tests.add(new Object[]{1, new PRTest(" -DIQ", "8977bea0c57b808e65e9505eb648cdf7")});
};
for ( final int nct : Arrays.asList(1, 2, 4) ) {
tests.add(new Object[]{nct, new PRTest("", "ab2f209ab98ad3432e208cbd524a4c4a")});
}
return tests.toArray(new Object[][]{});
} }
@Test(dataProvider = "PRTest") @Test(dataProvider = "PRTest")
public void testPR(PRTest params) { public void testPR(final int nct, PRTest params) {
WalkerTestSpec spec = new WalkerTestSpec( WalkerTestSpec spec = new WalkerTestSpec(
"-T PrintReads" + "-T PrintReads" +
" -R " + hg18Reference + " -R " + hg18Reference +
" -I " + privateTestDir + "HiSeq.1mb.1RG.bam" + " -I " + privateTestDir + "HiSeq.1mb.1RG.bam" +
" -BQSR " + privateTestDir + "HiSeq.1mb.1RG.table" + " -nct " + nct +
" -BQSR " + privateTestDir + "HiSeq.20mb.1RG.table" +
params.args + params.args +
" -o %s", " -o %s",
Arrays.asList(params.md5)); Arrays.asList(params.md5));

View File

@ -63,7 +63,7 @@ public class BaseCountsUnitTest extends BaseTest {
String name = String.format("Test-%s", params.bases); String name = String.format("Test-%s", params.bases);
Assert.assertEquals(counts.totalCount(), params.bases.length(), name); Assert.assertEquals(counts.totalCount(), params.bases.length(), name);
Assert.assertEquals(counts.countOfMostCommonBase(), params.mostCommonCount, name); Assert.assertEquals(counts.countOfBase(counts.baseIndexWithMostCounts()), params.mostCommonCount, name);
Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name); Assert.assertEquals((char)counts.baseWithMostCounts(), (char)params.mostCountBase, name);
} }
} }

View File

@ -21,33 +21,33 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
executeTest(testName, spec); executeTest(testName, spec);
} }
@Test(enabled = true) @Test(enabled = false)
public void testDefaultCompression() { public void testDefaultCompression() {
RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f"); RRTest("testDefaultCompression ", L, "323dd4deabd7767efa0f2c6e7fa4189f");
} }
@Test(enabled = true) @Test(enabled = false)
public void testMultipleIntervals() { public void testMultipleIntervals() {
String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110"; String intervals = "-L 20:10,100,000-10,100,500 -L 20:10,200,000-10,200,500 -L 20:10,300,000-10,300,500 -L 20:10,400,000-10,500,000 -L 20:10,500,050-10,500,060 -L 20:10,600,000-10,600,015 -L 20:10,700,000-10,700,110";
RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76"); RRTest("testMultipleIntervals ", intervals, "c437fb160547ff271f8eba30e5f3ff76");
} }
@Test(enabled = true) @Test(enabled = false)
public void testHighCompression() { public void testHighCompression() {
RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047"); RRTest("testHighCompression ", " -cs 10 -minvar 0.3 -mindel 0.3 " + L, "3a607bc3ebaf84e9dc44e005c5f8a047");
} }
@Test(enabled = true) @Test(enabled = false)
public void testLowCompression() { public void testLowCompression() {
RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63"); RRTest("testLowCompression ", " -cs 30 -minvar 0.01 -mindel 0.01 -minmap 5 -minqual 5 " + L, "7c9b4a70c2c90b0a995800aa42852e63");
} }
@Test(enabled = true) @Test(enabled = false)
public void testIndelCompression() { public void testIndelCompression() {
RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973"); RRTest("testIndelCompression ", " -cs 50 -L 20:10,100,500-10,100,600 ", "f7b9fa44c10bc4b2247813d2b8dc1973");
} }
@Test(enabled = true) @Test(enabled = false)
public void testFilteredDeletionCompression() { public void testFilteredDeletionCompression() {
String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s "; String base = String.format("-T ReduceReads -npt -R %s -I %s ", REF, DELETION_BAM) + " -o %s ";
executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb"))); executeTest("testFilteredDeletionCompression", new WalkerTestSpec(base, Arrays.asList("891bd6dcda66611f343e8ff25f34aaeb")));
@ -61,7 +61,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
* *
* This bam is simplified to replicate the exact bug with the three provided intervals. * This bam is simplified to replicate the exact bug with the three provided intervals.
*/ */
@Test(enabled = true) @Test(enabled = false)
public void testAddingReadAfterTailingTheStash() { public void testAddingReadAfterTailingTheStash() {
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s "; String base = String.format("-T ReduceReads %s -npt -R %s -I %s", STASH_L, REF, STASH_BAM) + " -o %s ";
executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6"))); executeTest("testAddingReadAfterTailingTheStash", new WalkerTestSpec(base, Arrays.asList("886b43e1f26ff18425814dc7563931c6")));
@ -71,7 +71,7 @@ public class ReduceReadsIntegrationTest extends WalkerTest {
* Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get * Divide by zero bug reported by GdA and users in the forum. Happens when the downsampler goes over a region where all reads get
* filtered out. * filtered out.
*/ */
@Test(enabled = true) @Test(enabled = false)
public void testDivideByZero() { public void testDivideByZero() {
String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s "; String base = String.format("-T ReduceReads %s -npt -R %s -I %s", DIVIDEBYZERO_L, REF, DIVIDEBYZERO_BAM) + " -o %s ";
executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2"))); executeTest("testDivideByZero", new WalkerTestSpec(base, Arrays.asList("93ffdc209d4cc0fc4f0169ca9be55cc2")));

View File

@ -35,7 +35,7 @@ public void testBaseCounts() {
new TestRead(bases, quals, new Byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})}; new TestRead(bases, quals, new Byte[] {1, 127, 51, 126}, new byte [] {1, 126, 50, 125})};
for (TestRead testRead : testReads) { for (TestRead testRead : testReads) {
SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false); SyntheticRead syntheticRead = new SyntheticRead(Arrays.asList(testRead.getBases()), Arrays.asList(testRead.getCounts()), Arrays.asList(testRead.getQuals()), Arrays.asList(testRead.getInsQuals()), Arrays.asList(testRead.getDelQuals()), artificialMappingQuality, GATKSAMRecord.REDUCED_READ_CONSENSUS_TAG, artificialSAMHeader, artificialGATKRG, artificialContig, artificialContigIndex, artificialReadName, artificialRefStart, false, false);
Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts()); Assert.assertEquals(syntheticRead.convertBaseCounts(), testRead.getExpectedCounts());
} }
} }

View File

@ -1,9 +1,9 @@
package org.broadinstitute.sting.gatk.walkers.genotyper; package org.broadinstitute.sting.gatk.walkers.genotyper;
import org.broadinstitute.sting.WalkerTest; import org.broadinstitute.sting.WalkerTest;
import org.testng.annotations.Test;
import java.util.Arrays; import java.util.Arrays;
import org.testng.annotations.Test;
/** /**
* Created by IntelliJ IDEA. * Created by IntelliJ IDEA.
@ -18,8 +18,9 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam"; final String LSV_BAM = validationDataLocation +"93pools_NA12878_ref_chr20_40m_41m.bam";
final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf"; final String REFSAMPLE_MT_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12878.snp.vcf";
final String REFSAMPLE_NAME = "NA12878"; final String REFSAMPLE_NAME = "NA12878";
final String MTINTERVALS = "MT:1-3000"; final String MTINTERVALS = "MT:1-1000";
final String LSVINTERVALS = "20:40,000,000-41,000,000"; final String LSVINTERVALS = "20:40,500,000-41,000,000";
final String LSVINTERVALS_SHORT = "20:40,500,000-40,501,000";
final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf"; final String NA12891_CALLS = comparisonDataLocation + "Unvalidated/mtDNA/NA12891.snp.vcf";
final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf"; final String NA12878_WG_CALLS = comparisonDataLocation + "Unvalidated/NA12878/CEUTrio.HiSeq.WGS.b37_decoy.recal.ts_95.snp_indel_combined.vcf";
final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf"; final String LSV_ALLELES = validationDataLocation + "ALL.chr20_40m_41m.largeScaleValidationSites.vcf";
@ -38,6 +39,13 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
executeTest("testPoolCaller:"+name+" args=" + args, spec); executeTest("testPoolCaller:"+name+" args=" + args, spec);
} }
private void PC_LSV_Test_short(String args, String name, String model, String md5) {
final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s --reference_sample_calls %s -refsample %s -glm %s -ignoreLane ",
REF, LSV_BAM, LSVINTERVALS_SHORT, NA12878_WG_CALLS, REFSAMPLE_NAME, model) + " --no_cmdline_in_header -o %s";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testPoolCaller:"+name+" args=" + args, spec);
}
private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) { private void PC_LSV_Test_NoRef(String args, String name, String model, String md5) {
final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane", final String base = String.format("-T UnifiedGenotyper -dcov 10000 -R %s -I %s -L %s -glm %s -ignoreLane",
REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s"; REF, LSV_BAM, LSVINTERVALS, model) + " --no_cmdline_in_header -o %s";
@ -45,33 +53,38 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest {
executeTest("testPoolCaller:"+name+" args=" + args, spec); executeTest("testPoolCaller:"+name+" args=" + args, spec);
} }
@Test(enabled = true)
public void testSNP_ACS_Pools() {
PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES","LSV_SNP_ACS","SNP","df0e67c975ef74d593f1c704daab1705");
}
@Test(enabled = true) @Test(enabled = true)
public void testBOTH_GGA_Pools() { public void testBOTH_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","0934f72865388999efec64bd9d4a9b93"); PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","7e5b28c9e21cc7e45c58c41177d8a0fc");
} }
@Test(enabled = true) @Test(enabled = true)
public void testINDEL_GGA_Pools() { public void testINDEL_GGA_Pools() {
PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","126581c72d287722437274d41b6fed7b"); PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","ae6c276cc46785a794acff6f7d10ecf7");
} }
@Test(enabled = true) @Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() {
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","b543aa1c3efedb301e525c1d6c50ed8d"); PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","481452ad7d6378cffb5cd834cc621d55");
} }
@Test(enabled = true) @Test(enabled = true)
public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() {
PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","55b20557a836bb92688e68f12d7f5dc4"); PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","812957e51277aca9925c1a7bb4d9a118");
} }
@Test(enabled = true) @Test(enabled = true)
public void testMT_SNP_DISCOVERY_sp4() { public void testMT_SNP_DISCOVERY_sp4() {
PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","7eb889e8e07182f4c3d64609591f9459"); PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","dd568dc30be90135a3a8957a45a7321c");
} }
@Test(enabled = true) @Test(enabled = true)
public void testMT_SNP_GGA_sp10() { public void testMT_SNP_GGA_sp10() {
PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "db8114877b99b14f7180fdcd24b040a7"); PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "bf793c43b635a931207170be8035b288");
} }
} }

View File

@ -0,0 +1,87 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
public class AFCalcPerformanceUnitTest extends BaseTest {

    /**
     * Generates scaling test cases over sample counts, alt-allele counts, allele-count
     * configurations, and AF calculation models.
     */
    @DataProvider(name = "ScalingTests")
    public Object[][] makepolyTestProviderLotsOfAlleles() {
        List<Object[]> tests = new ArrayList<Object[]>();

        // list of all high-quality models in the system
        final List<AFCalcFactory.Calculation> biAllelicModels = Arrays.asList(
                AFCalcFactory.Calculation.EXACT_INDEPENDENT,
                AFCalcFactory.Calculation.EXACT_REFERENCE);
        // only EXACT_INDEPENDENT handles multi-allelic sites here
        final List<AFCalcFactory.Calculation> multiAllelicModels = Arrays.asList(
                AFCalcFactory.Calculation.EXACT_INDEPENDENT);

        for ( final int nonTypePLs : Arrays.asList(100) ) {
            for ( final int nSamples : Arrays.asList(100, 1000) ) {
                final List<Integer> alleleCounts = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 50, 500);
                for ( final int nAltAlleles : Arrays.asList(1, 2, 3) ) {
                    final List<AFCalcFactory.Calculation> models = nAltAlleles > 1 ? multiAllelicModels : biAllelicModels;
                    for ( final AFCalcFactory.Calculation model : models ) {
                        for ( final List<Integer> ACs : Utils.makePermutations(alleleCounts, nAltAlleles, true) ) {
                            // skip impossible configurations: total AC cannot exceed the number of chromosomes (diploid)
                            if ( MathUtils.sum(ACs) < nSamples * 2 ) {
                                final AFCalcTestBuilder testBuilder
                                        = new AFCalcTestBuilder(nSamples, nAltAlleles, model, AFCalcTestBuilder.PriorType.human);
                                tests.add(new Object[]{testBuilder, ACs, nonTypePLs});
                            }
                        }
                    }
                }
            }
        }

        return tests.toArray(new Object[][]{});
    }

    /**
     * Estimates a [min, max] interval for the number of model evaluations the calculator
     * should need for the given variant context.
     *
     * @param testBuilder the test configuration (currently unused in the estimate)
     * @param vc          the variant context whose called chromosome counts drive the bounds
     * @param nonTypePL   the non-type PL used to build the context (currently unused in the estimate)
     * @return pair of (min expected evaluations, max expected evaluations)
     */
    private Pair<Integer, Integer> estNumberOfEvaluations(final AFCalcTestBuilder testBuilder, final VariantContext vc, final int nonTypePL) {
        final int evalOverhead = 2; // fixed per-allele overhead
        final int maxEvalsPerSamplePerAC = 3;

        int minEvals = 0, maxEvals = 0;
        for ( final Allele alt : vc.getAlternateAlleles() ) {
            final int AC = vc.getCalledChrCount(alt);
            minEvals += AC + evalOverhead; // everyone is hom-var
            maxEvals += AC * maxEvalsPerSamplePerAC + 10;
        }

        return new Pair<Integer, Integer>(minEvals, maxEvals);
    }

    // made public: TestNG requires test methods to be public to be discovered and run
    @Test(dataProvider = "ScalingTests")
    public void testScaling(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL) {
        final AFCalc calc = testBuilder.makeModel();
        final double[] priors = testBuilder.makePriors();
        final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);
        final AFCalcResult result = calc.getLog10PNonRef(vc, priors);

        final Pair<Integer, Integer> expectedNEvaluation = estNumberOfEvaluations(testBuilder, vc, nonTypePL);
        final int minEvals = expectedNEvaluation.getFirst();
        final int maxEvals = expectedNEvaluation.getSecond();

        logger.warn(" min " + minEvals + " obs " + result.getnEvaluations() + " max " + maxEvals + " for test " + testBuilder + " sum(ACs)=" + (int)MathUtils.sum(ACs));
        Assert.assertTrue(result.getnEvaluations() >= minEvals,
                "Actual number of evaluations " + result.getnEvaluations() + " < min number of evals " + minEvals);
        // BUG FIX: the failure message previously printed minEvals instead of maxEvals
        Assert.assertTrue(result.getnEvaluations() <= maxEvals,
                "Actual number of evaluations " + result.getnEvaluations() + " > max number of evals " + maxEvals);
    }
}

View File

@ -0,0 +1,82 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
public class AFCalcResultUnitTest extends BaseTest {

    /** Pairs the input log10 likelihoods with the posteriors we expect AFCalcResult to compute. */
    private static class MyTest {
        final double[] Ls, expectedPosteriors;

        private MyTest(double[] ls, double[] expectedPosteriors) {
            Ls = ls;
            this.expectedPosteriors = expectedPosteriors;
        }

        @Override
        public String toString() {
            return "Ls [" + Utils.join(",", Ls) + "] expectedPosteriors [" + Utils.join(",", expectedPosteriors) + "]";
        }
    }

    @DataProvider(name = "TestComputePosteriors")
    public Object[][] makeTestCombineGLs() {
        List<Object[]> tests = new ArrayList<Object[]>();

        tests.add(new Object[]{new MyTest(log10Even, log10Even)});

        for ( double L0 = -1e9; L0 < 0.0; L0 /= 10.0 ) {
            for ( double L1 = -1e2; L1 < 0.0; L1 /= 100.0 ) {
                final double[] input = new double[]{L0, L1};
                final double[] expected = MathUtils.normalizeFromLog10(input, true);
                tests.add(new Object[]{new MyTest(input, expected)});
            }
        }

        // BUG FIX: the original condition was (bigBadL < -1e200), which is false on the very
        // first iteration (-1e50 > -1e200), so this loop never generated any test cases.
        // Iterate while the likelihood has not yet reached -1e200.
        for ( double bigBadL = -1e50; bigBadL > -1e200; bigBadL *= 10 ) {
            // test that a huge bad likelihood remains, even with a massive better result
            for ( final double betterL : Arrays.asList(-1000.0, -100.0, -10.0, -1.0, -0.1, -0.01, -0.001, 0.0)) {
                tests.add(new Object[]{new MyTest(new double[]{bigBadL, betterL}, new double[]{bigBadL, 0.0})});
                tests.add(new Object[]{new MyTest(new double[]{betterL, bigBadL}, new double[]{0.0, bigBadL})});
            }
        }

        // test that a modest bad likelihood with an ~0.0 value doesn't get lost
        for ( final double badL : Arrays.asList(-10000.0, -1000.0, -100.0, -10.0)) {
            tests.add(new Object[]{new MyTest(new double[]{badL, -1e-9}, new double[]{badL, 0.0})});
            tests.add(new Object[]{new MyTest(new double[]{-1e-9, badL}, new double[]{0.0, badL})});
        }

        // test that a non-ref site gets reasonable posteriors with an ~0.0 value doesn't get lost
        for ( final double nonRefL : Arrays.asList(-100.0, -50.0, -10.0, -9.0, -8.0, -7.0, -6.0, -5.0)) {
            tests.add(new Object[]{new MyTest(new double[]{0.0, nonRefL}, new double[]{0.0, nonRefL})});
        }

        return tests.toArray(new Object[][]{});
    }

    // even prior in log10 space, shared by several test cases
    final static double[] log10Even = MathUtils.normalizeFromLog10(new double[]{0.5, 0.5}, true);
    final static Allele C = Allele.create("C");
    final static List<Allele> alleles = Arrays.asList(Allele.create("A", true), C);

    // made public: TestNG requires test methods to be public to be discovered and run
    @Test(enabled = true, dataProvider = "TestComputePosteriors")
    public void testComputingPosteriors(final MyTest data) {
        final AFCalcResult result = new AFCalcResult(new int[]{0}, 1, alleles, data.Ls, log10Even, Collections.singletonMap(C, -1.0));

        Assert.assertEquals(result.getLog10PosteriorOfAFEq0(), data.expectedPosteriors[0], 1e-3, "AF = 0 not expected");
        Assert.assertEquals(result.getLog10PosteriorOfAFGT0(), data.expectedPosteriors[1], 1e-3, "AF > 0 not expected");

        final double[] actualPosteriors = new double[]{result.getLog10PosteriorOfAFEq0(), result.getLog10PosteriorOfAFGT0()};
        Assert.assertEquals(MathUtils.sumLog10(actualPosteriors), 1.0, 1e-3, "Posteriors don't sum to 1 with 1e-3 precision");
    }
}

View File

@ -0,0 +1,687 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.apache.commons.lang.ArrayUtils;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.QualityUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.*;
import org.testng.Assert;
import org.testng.annotations.BeforeSuite;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
public class AFCalcUnitTest extends BaseTest {
static Allele A = Allele.create("A", true);
static Allele C = Allele.create("C");
static Allele G = Allele.create("G");
static int sampleNameCounter = 0;
static Genotype AA1, AB1, BB1, NON_INFORMATIVE1;
static Genotype AA2, AB2, AC2, BB2, BC2, CC2, NON_INFORMATIVE2;
final double[] FLAT_3SAMPLE_PRIORS = MathUtils.normalizeFromLog10(new double[2*3+1], true); // flat priors
final private static boolean INCLUDE_BIALLELIC = true;
final private static boolean INCLUDE_TRIALLELIC = true;
final private static boolean Guillermo_FIXME = false; // TODO -- can only be enabled when GdA fixes bug
final private static boolean DEBUG_ONLY = false;
@BeforeSuite
public void before() {
    // Bi-allelic genotypes (ref A, alt C); PL order is AA,AB,BB and the called
    // genotype carries PL 0 with 20 everywhere else.
    AA1 = makePL(Arrays.asList(A, A), 0, 20, 20);
    AB1 = makePL(Arrays.asList(A, C), 20, 0, 20);
    BB1 = makePL(Arrays.asList(C, C), 20, 20, 0);
    // All-zero PLs carry no genotype information at all.
    NON_INFORMATIVE1 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0);

    // Tri-allelic genotypes (ref A, alts C and G); PL order is AA,AB,BB,AC,BC,CC.
    AA2 = makePL(Arrays.asList(A, A), 0, 20, 20, 20, 20, 20);
    AB2 = makePL(Arrays.asList(A, C), 20, 0, 20, 20, 20, 20);
    BB2 = makePL(Arrays.asList(C, C), 20, 20, 0, 20, 20, 20);
    AC2 = makePL(Arrays.asList(A, G), 20, 20, 20, 0, 20, 20);
    BC2 = makePL(Arrays.asList(C, G), 20, 20, 20, 20, 0, 20);
    CC2 = makePL(Arrays.asList(G, G), 20, 20, 20, 20, 20, 0);
    NON_INFORMATIVE2 = makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), 0, 0, 0, 0, 0, 0);
}
/**
 * Builds a Genotype with a fresh, unique sample name, the given called alleles,
 * and the given phred-scaled likelihoods.
 *
 * @param expectedGT the alleles to record as the called genotype
 * @param pls        the PL vector, in standard genotype-likelihood order
 * @return a newly constructed Genotype
 */
protected static Genotype makePL(final List<Allele> expectedGT, int ... pls) {
    final String uniqueName = "sample" + sampleNameCounter++;
    return new GenotypeBuilder(uniqueName).alleles(expectedGT).PL(pls).make();
}
/**
 * One test configuration: a set of genotype likelihoods plus the model and priors to
 * run them through, with the expected per-allele counts derived directly from the
 * alleles called in the input genotypes.
 */
private class GetGLsTest extends TestDataProvider {
    GenotypesContext GLs;
    int numAltAlleles;
    final AFCalc calc;                 // the model under test
    final int[] expectedACs;           // expected count per allele; index 0 is the ref allele
    final double[] priors;             // log10 allele-frequency priors handed to the model
    final String priorName;            // human-readable prior tag, used in toString()

    private GetGLsTest(final AFCalc calc, int numAltAlleles, List<Genotype> arg, final double[] priors, final String priorName) {
        super(GetGLsTest.class);
        GLs = GenotypesContext.create(new ArrayList<Genotype>(arg));
        this.numAltAlleles = numAltAlleles;
        this.calc = calc;
        this.priors = priors;
        this.priorName = priorName;

        // Count how often each allele occurs among the input genotypes; these are the
        // allele counts the model is expected to recover as its MLE.
        expectedACs = new int[numAltAlleles+1];
        for ( int alleleI = 0; alleleI < expectedACs.length; alleleI++ ) {
            expectedACs[alleleI] = 0;
            final Allele allele = getAlleles().get(alleleI);
            for ( Genotype g : arg ) {
                expectedACs[alleleI] += Collections.frequency(g.getAlleles(), allele);
            }
        }
    }

    /** Runs the configured model on this test's VariantContext and priors. */
    public AFCalcResult execute() {
        return getCalc().getLog10PNonRef(getVC(), getPriors());
    }

    /** Runs the EXACT_REFERENCE model on the same inputs, as the gold standard. */
    public AFCalcResult executeRef() {
        final AFCalc ref = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_REFERENCE, getCalc().nSamples, getCalc().getMaxAltAlleles());
        return ref.getLog10PNonRef(getVC(), getPriors());
    }

    public double[] getPriors() {
        return priors;
    }

    public AFCalc getCalc() {
        return calc;
    }

    /** Builds a VariantContext at 1:1 carrying this test's alleles and genotypes. */
    public VariantContext getVC() {
        VariantContextBuilder builder = new VariantContextBuilder("test", "1", 1, 1, getAlleles());
        builder.genotypes(GLs);
        return builder.make();
    }

    /** Ref allele A plus the first numAltAlleles of C, G, T (in that order). */
    public List<Allele> getAlleles() {
        return Arrays.asList(Allele.create("A", true),
                Allele.create("C"),
                Allele.create("G"),
                Allele.create("T")).subList(0, numAltAlleles+1);
    }

    /** Expected AC for the alleleI-th alt allele (0-based among alts only). */
    public int getExpectedAltAC(final int alleleI) {
        return expectedACs[alleleI+1];
    }

    public String toString() {
        return String.format("%s model=%s prior=%s input=%s", super.toString(), calc.getClass().getSimpleName(),
                priorName, GLs.size() > 5 ? String.format("%d samples", GLs.size()) : GLs);
    }
}
/**
 * Provider of well-formed GL test cases: every permutation of the canned bi- and
 * tri-allelic genotypes, for 1-4 samples, across all models and both prior shapes.
 */
@DataProvider(name = "wellFormedGLs")
public Object[][] createSimpleGLsData() {
    // Pools of pre-built genotypes (see before()) to permute into sample sets.
    final List<Genotype> biAllelicSamples = Arrays.asList(AA1, AB1, BB1);
    final List<Genotype> triAllelicSamples = Arrays.asList(AA2, AB2, BB2, AC2, BC2, CC2);

    for ( final int nSamples : Arrays.asList(1, 2, 3, 4) ) {
        List<AFCalc> calcs = AFCalcFactory.createAFCalcs( Arrays.asList( AFCalcFactory.Calculation.values() ), 4, 2, 2);

        // one prior value per possible AC: 0 .. 2*nSamples
        final int nPriorValues = 2*nSamples+1;
        final double[] flatPriors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
        final double[] humanPriors = new double[nPriorValues];
        UnifiedGenotyperEngine.computeAlleleFrequencyPriors(nPriorValues - 1, humanPriors, 0.001);

        for ( final double[] priors : Arrays.asList(flatPriors, humanPriors) ) { // , humanPriors) ) {
            for ( AFCalc model : calcs ) {
                final String priorName = priors == humanPriors ? "human" : "flat";

                // bi-allelic cases
                if ( INCLUDE_BIALLELIC && nSamples <= biAllelicSamples.size() )
                    for ( List<Genotype> genotypes : Utils.makePermutations(biAllelicSamples, nSamples, true) )
                        new GetGLsTest(model, 1, genotypes, priors, priorName);

                // tri-allelic cases: skipped for the original diploid model, and for
                // human priors until the Guillermo_FIXME bug is resolved
                if ( INCLUDE_TRIALLELIC && ( ! priorName.equals("human") || Guillermo_FIXME ) && ! ( model instanceof OriginalDiploidExactAFCalc) ) // || model != generalCalc ) )
                    for ( List<Genotype> genotypes : Utils.makePermutations(triAllelicSamples, nSamples, true) )
                        new GetGLsTest(model, 2, genotypes, priors, priorName);
            }
        }
    }

    // GetGLsTest instances self-register with TestDataProvider; collect them here.
    return GetGLsTest.getTests(GetGLsTest.class);
}
// @DataProvider(name = "badGLs")
// public Object[][] createBadGLs() {
// final List<Genotype> genotypes = Arrays.asList(AB2, BB2, CC2, CC2);
// final int nSamples = genotypes.size();
//
// final AFCalc indCalc = AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, nSamples, 4);
//
// final int nPriorValues = 2*nSamples+1;
// final double[] priors = MathUtils.normalizeFromLog10(new double[nPriorValues], true); // flat priors
// for ( AFCalc model : Arrays.asList(indCalc) ) {
// final String priorName = "flat";
// new GetGLsTest(model, 2, genotypes, priors, priorName);
// }
//
// return GetGLsTest.getTests(GetGLsTest.class);
// }
//
// @Test(enabled = true && !DEBUG_ONLY, dataProvider = "badGLs")
// public void testBadGLs(GetGLsTest cfg) {
// testResultSimple(cfg);
// }
/** Runs the standard result checks on every bi-allelic (ref + one alt) configuration. */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
public void testBiallelicGLs(GetGLsTest cfg) {
    // Skip anything that is not exactly ref + one alt.
    if ( cfg.getAlleles().size() != 2 )
        return;
    testResultSimple(cfg);
}
/** Runs the standard result checks on every multi-allelic (three or more alleles) configuration. */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "wellFormedGLs")
public void testTriallelicGLs(GetGLsTest cfg) {
    // Skip the bi-allelic cases; those are covered by testBiallelicGLs.
    if ( cfg.getAlleles().size() <= 2 )
        return;
    testResultSimple(cfg);
}
/**
 * One padding scenario: the informative (called) genotypes, the non-informative
 * genotype used to pad them, and how many alt alleles are in play.
 */
private static class NonInformativeData {
    final Genotype nonInformative;   // all-zero-PL genotype used as padding
    final List<Genotype> called;     // the genotypes that actually carry information
    final int nAltAlleles;

    private NonInformativeData(List<Genotype> called, Genotype nonInformative, int nAltAlleles) {
        this.called = called;
        this.nonInformative = nonInformative;
        this.nAltAlleles = nAltAlleles;
    }
}
/**
 * Provider pairing each informative-only test with the same test padded by 1/10/100
 * non-informative samples, in every rotation of the sample order.  Padding must not
 * change the model's answer.
 */
@DataProvider(name = "GLsWithNonInformative")
public Object[][] makeGLsWithNonInformative() {
    List<Object[]> tests = new ArrayList<Object[]>();

    final List<NonInformativeData> nonInformativeTests = new LinkedList<NonInformativeData>();
    nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB1), NON_INFORMATIVE1, 1));
    nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2), NON_INFORMATIVE2, 2));
    nonInformativeTests.add(new NonInformativeData(Arrays.asList(AB2, BC2), NON_INFORMATIVE2, 2));

    for ( final int nNonInformative : Arrays.asList(1, 10, 100) ) {
        for ( final NonInformativeData testData : nonInformativeTests ) {
            // informative genotypes followed by nNonInformative copies of the padding genotype
            final List<Genotype> samples = new ArrayList<Genotype>();
            samples.addAll(testData.called);
            samples.addAll(Collections.nCopies(nNonInformative, testData.nonInformative));

            final int nSamples = samples.size();
            List<AFCalc> calcs = AFCalcFactory.createAFCalcs(Arrays.asList(AFCalcFactory.Calculation.values()), 4, 2, 2);

            final double[] priors = MathUtils.normalizeFromLog10(new double[2*nSamples+1], true); // flat priors

            for ( AFCalc model : calcs ) {
                // the original diploid model does not support multiple alt alleles
                if ( testData.nAltAlleles > 1 && model instanceof OriginalDiploidExactAFCalc )
                    continue;

                final GetGLsTest onlyInformative = new GetGLsTest(model, testData.nAltAlleles, testData.called, priors, "flat");

                // rotate the sample list so the padding appears in every position
                for ( int rotation = 0; rotation < nSamples; rotation++ ) {
                    Collections.rotate(samples, 1);
                    final GetGLsTest withNonInformative = new GetGLsTest(model, testData.nAltAlleles, samples, priors, "flat");
                    tests.add(new Object[]{onlyInformative, withNonInformative});
                }
            }
        }
    }

    return tests.toArray(new Object[][]{});
}
/**
 * Verifies that padding a sample set with non-informative genotypes leaves the
 * model's result unchanged relative to the informative-only run.
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "GLsWithNonInformative", dependsOnMethods = {"testBiallelicGLs", "testTriallelicGLs"})
public void testGLsWithNonInformative(GetGLsTest onlyInformative, GetGLsTest withNonInformative) {
    // The padded configuration must pass the standard checks on its own ...
    testResultSimple(withNonInformative);

    // ... and must produce the same posteriors as the unpadded configuration.
    final AFCalcResult paddedResult = withNonInformative.execute();
    final AFCalcResult unpaddedResult = onlyInformative.execute();
    compareAFCalcResults(paddedResult, unpaddedResult, onlyInformative.getCalc(), true);
}
/**
 * Core result check: the model's answer must agree with the reference implementation,
 * report only alleles we supplied, and recover the expected MLE allele counts.
 */
private void testResultSimple(final GetGLsTest cfg) {
    final AFCalcResult refResultTracker = cfg.executeRef();
    final AFCalcResult resultTracker = cfg.execute();
    compareAFCalcResults(resultTracker, refResultTracker, cfg.getCalc(), true);

    Assert.assertNotNull(resultTracker.getAllelesUsedInGenotyping());
    Assert.assertTrue(cfg.getAlleles().containsAll(resultTracker.getAllelesUsedInGenotyping()), "Result object has alleles not in our initial allele list");

    // The MLE AC of each alt allele must match the count in the input genotypes.
    for ( int altAlleleI = 0; altAlleleI < cfg.numAltAlleles; altAlleleI++ ) {
        int expectedAlleleCount = cfg.getExpectedAltAC(altAlleleI);
        int calcAC_MLE = resultTracker.getAlleleCountsOfMLE()[altAlleleI];
        final Allele allele = cfg.getAlleles().get(altAlleleI+1);
        Assert.assertEquals(calcAC_MLE, expectedAlleleCount, "MLE AC not equal to expected AC for allele " + allele);
    }
}
/**
 * Asserts that two AFCalcResults agree: posteriors, MLE allele counts, and the
 * alleles used in genotyping always; priors and likelihoods additionally when
 * onlyPosteriorsShouldBeEqual is false.
 *
 * @param actual   result from the model under test
 * @param expected result from the reference model
 * @param calc     the model under test (used to pick the tolerance)
 * @param onlyPosteriorsShouldBeEqual when true, skip the prior/likelihood comparisons
 */
private void compareAFCalcResults(final AFCalcResult actual, final AFCalcResult expected, final AFCalc calc, final boolean onlyPosteriorsShouldBeEqual) {
    // note we cannot really test the multi-allelic case because we actually meaningfully differ among the models here
    final double TOLERANCE = calc.getMaxAltAlleles() > 1 ? 1000 : 0.1; // much tighter constraints on bi-allelic results

    if ( ! onlyPosteriorsShouldBeEqual ) {
        Assert.assertEquals(actual.getLog10PriorOfAFEq0(), expected.getLog10PriorOfAFEq0(), TOLERANCE, "Priors AF == 0");
        Assert.assertEquals(actual.getLog10PriorOfAFGT0(), expected.getLog10PriorOfAFGT0(), TOLERANCE, "Priors AF > 0");
        Assert.assertEquals(actual.getLog10LikelihoodOfAFEq0(), expected.getLog10LikelihoodOfAFEq0(), TOLERANCE, "Likelihoods AF == 0");
        Assert.assertEquals(actual.getLog10LikelihoodOfAFGT0(), expected.getLog10LikelihoodOfAFGT0(), TOLERANCE, "Likelihoods AF > 0");
    }
    Assert.assertEquals(actual.getLog10PosteriorOfAFEq0(), expected.getLog10PosteriorOfAFEq0(), TOLERANCE, "Posteriors AF == 0");
    Assert.assertEquals(actual.getLog10PosteriorOfAFGT0(), expected.getLog10PosteriorOfAFGT0(), TOLERANCE, "Posteriors AF > 0");
    Assert.assertEquals(actual.getAlleleCountsOfMLE(), expected.getAlleleCountsOfMLE(), "MLE ACs");
    Assert.assertEquals(actual.getAllelesUsedInGenotyping(), expected.getAllelesUsedInGenotyping(), "Alleles used in genotyping");

    // per-allele MLE ACs must match exactly for every non-reference allele
    for ( final Allele a : expected.getAllelesUsedInGenotyping() ) {
        if ( ! a.isReference() ) {
            Assert.assertEquals(actual.getAlleleCountAtMLE(a), expected.getAlleleCountAtMLE(a), "MLE AC for allele " + a);
            // TODO -- enable me when IndependentAllelesDiploidExactAFCalc works properly
            // if ( ! ( calc instanceof GeneralPloidyExactAFCalc ) )
            // // TODO -- delete when general ploidy works properly with multi-allelics
            // Assert.assertEquals(actual.isPolymorphic(a, 0.0), expected.isPolymorphic(a, 0.0), "isPolymorphic with thread 0.0 for allele " + a);
        }
    }
}
/**
 * Numerical-stability check: enormous PLs must not overflow the exact model.
 * Three hom-var samples imply an MLE alt allele count of 6.
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
public void testLargeGLs(final ExactAFCalc calc) {
    final Genotype hugeHomVar = makePL(Arrays.asList(C, C), 20000000, 20000000, 0);
    final GetGLsTest cfg = new GetGLsTest(calc, 1, Arrays.asList(hugeHomVar, hugeHomVar, hugeHomVar), FLAT_3SAMPLE_PRIORS, "flat");

    final int mleAC = cfg.execute().getAlleleCountsOfMLE()[0];
    Assert.assertEquals(mleAC, 6);
}
/**
 * Mixed-confidence multi-allelic check: one very confident A/C het plus one less
 * confident A/G het must still yield an MLE AC of 1 for each alt allele.
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
public void testMismatchedGLs(final ExactAFCalc calc) {
    final Genotype hetAC = makePL(Arrays.asList(A, C), 2000, 0, 2000, 2000, 2000, 2000);
    final Genotype hetAG = makePL(Arrays.asList(A, G), 100, 100, 100, 0, 100, 100);

    final GetGLsTest cfg = new GetGLsTest(calc, 2, Arrays.asList(hetAC, hetAG), FLAT_3SAMPLE_PRIORS, "flat");
    final AFCalcResult result = cfg.execute();

    Assert.assertEquals(result.getAlleleCountsOfMLE()[0], 1);
    Assert.assertEquals(result.getAlleleCountsOfMLE()[1], 1);
}
// --------------------------------------------------------------------------------
//
// Code to test that the pNonRef value is meaningful
//
// --------------------------------------------------------------------------------
/**
 * One pNonRef expectation: a genotype, the expected probability that AF > 0, a
 * comparison tolerance, and whether the case can be rescaled to larger PLs.
 */
private static class PNonRefData {
    final Genotype g;
    final double pNonRef, tolerance;
    final boolean canScale;                              // whether scale() may be applied
    final List<AFCalcFactory.Calculation> badModels;     // models known to fail this case
    final VariantContext vc;

    private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale) {
        this(vc, g, pNonRef, tolerance, canScale, Collections.<AFCalcFactory.Calculation>emptyList());
    }

    private PNonRefData(final VariantContext vc, Genotype g, double pNonRef, double tolerance, final boolean canScale, final List<AFCalcFactory.Calculation> badModels) {
        this.g = g;
        this.pNonRef = pNonRef;
        this.tolerance = tolerance;
        this.canScale = canScale;
        this.badModels = badModels;
        this.vc = vc;
    }

    /**
     * Returns a copy with PLs multiplied by (int)log10(scaleFactor)+1 and pNonRef
     * moved toward 0 or 1 by scaleFactor.  NOTE(review): this is a deliberately
     * crude estimator (the two scalings are not mathematically consistent); the
     * generous tolerances downstream appear to absorb the error -- confirm before
     * tightening them.
     */
    public PNonRefData scale(final int scaleFactor) {
        if ( canScale ) {
            final int[] PLs = new int[g.getPL().length];
            for ( int i = 0; i < PLs.length; i++ ) PLs[i] = g.getPL()[i] * ((int)Math.log10(scaleFactor)+1);
            final Genotype scaledG = new GenotypeBuilder(g).PL(PLs).make();
            final double scaledPNonRef = pNonRef < 0.5 ? pNonRef / scaleFactor : 1 - ((1-pNonRef) / scaleFactor);
            return new PNonRefData(vc, scaledG, scaledPNonRef, tolerance, true);
        } else {
            return this;
        }
    }
}
/**
 * Provider of pNonRef expectations across models, PL scalings, and padding counts.
 * Truth values were computed externally (see the private/R/pls.R note elsewhere in
 * this test suite).
 */
@DataProvider(name = "PNonRef")
public Object[][] makePNonRefTest() {
    List<Object[]> tests = new ArrayList<Object[]>();

    final List<Allele> AA = Arrays.asList(A, A);
    final List<Allele> AC = Arrays.asList(A, C);
    final List<Allele> CC = Arrays.asList(C, C);
    final List<Allele> AG = Arrays.asList(A, G);
    final List<Allele> GG = Arrays.asList(G, G);
    final List<Allele> CG = Arrays.asList(C, G);

    final VariantContext vc2 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C)).make();
    final VariantContext vc3 = new VariantContextBuilder("x","1", 1, 1, Arrays.asList(A, C, G)).make();
    final AFCalcTestBuilder.PriorType priorType = AFCalcTestBuilder.PriorType.flat;

    final double TOLERANCE = 0.5;

    final List<PNonRefData> initialPNonRefData = Arrays.asList(
            // bi-allelic sites
            new PNonRefData(vc2, makePL(AA, 0, 10, 10), 0.1666667, TOLERANCE, true),
            new PNonRefData(vc2, makePL(AA, 0, 1, 10), 0.4721084, TOLERANCE, false),
            new PNonRefData(vc2, makePL(AA, 0, 1, 1), 0.6136992, TOLERANCE, false),
            new PNonRefData(vc2, makePL(AA, 0, 5, 5), 0.3874259, TOLERANCE, false),
            new PNonRefData(vc2, makePL(AC, 10, 0, 10), 0.9166667, TOLERANCE, true),
            new PNonRefData(vc2, makePL(CC, 10, 10, 0), 0.9166667, TOLERANCE, true),
            // tri-allelic sites -- cannot scale because of the naivety of our scaling estimator
            new PNonRefData(vc3, makePL(AA, 0, 10, 10, 10, 10, 10), 0.3023255813953489, TOLERANCE * 2, false), // more tolerance because constrained model is a bit inaccurate
            new PNonRefData(vc3, makePL(AC, 10, 0, 10, 10, 10, 10), 0.9166667, TOLERANCE, false),
            new PNonRefData(vc3, makePL(CC, 10, 10, 0, 10, 10, 10), 0.9166667, TOLERANCE, false),
            new PNonRefData(vc3, makePL(AG, 10, 10, 10, 0, 10, 10), 0.9166667, TOLERANCE, false),
            new PNonRefData(vc3, makePL(CG, 10, 10, 10, 10, 0, 10), 0.80, TOLERANCE, false),
            new PNonRefData(vc3, makePL(GG, 10, 10, 10, 10, 10, 0), 0.9166667, TOLERANCE, false)
    );

    for ( AFCalcFactory.Calculation modelType : Arrays.asList(AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcFactory.Calculation.EXACT_INDEPENDENT) ) {
        for ( int nNonInformative = 0; nNonInformative < 3; nNonInformative++ ) {
            for ( final PNonRefData rootData : initialPNonRefData ) {
                // rescale the PLs by powers of 10 where the case allows it
                for ( int plScale = 1; plScale <= 100000; plScale *= 10 ) {
                    if ( ! rootData.badModels.contains(modelType) && (plScale == 1 || rootData.canScale) ) {
                        final PNonRefData data = rootData.scale(plScale);
                        tests.add(new Object[]{data.vc, modelType, priorType, Arrays.asList(data.g), data.pNonRef, data.tolerance, nNonInformative});
                    }
                }
            }
        }
    }

    return tests.toArray(new Object[][]{});
}
/**
 * Checks that the model's posterior P(AF > 0) matches the externally computed
 * expectation within the case's tolerance.
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRef")
private void testPNonRef(final VariantContext vcRoot,
                         AFCalcFactory.Calculation modelType,
                         AFCalcTestBuilder.PriorType priorType,
                         final List<Genotype> genotypes,
                         final double expectedPNonRef,
                         final double tolerance,
                         final int nNonInformative) {
    final AFCalcTestBuilder testBuilder
            = new AFCalcTestBuilder(1, vcRoot.getNAlleles()-1, modelType, priorType);

    final VariantContextBuilder vcb = new VariantContextBuilder(vcRoot);
    vcb.genotypes(genotypes);

    final AFCalcResult resultTracker = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());

    // compare in log10 space, since that is what the calculator reports
    Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), Math.log10(expectedPNonRef), tolerance,
            "Actual pNonRef not within tolerance " + tolerance + " of expected");
}
/**
 * Systematic bi-allelic provider: every model (except the known-broken general
 * ploidy one) run against exhaustive single-sample PL combinations plus small
 * multi-sample combinations.
 */
@DataProvider(name = "PNonRefBiallelicSystematic")
public Object[][] makePNonRefBiallelicSystematic() {
    List<Object[]> tests = new ArrayList<Object[]>();

    final List<Integer> bigNonRefPLs = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 50, 100, 1000);
    final List<List<Integer>> bigDiploidPLs = removeBadPLs(Utils.makePermutations(bigNonRefPLs, 3, true));

    for ( AFCalcFactory.Calculation modelType : AFCalcFactory.Calculation.values() ) {

        if ( false ) { // for testing only -- debug switch: flip to run one hand-picked case
            tests.add(new Object[]{modelType, toGenotypes(Arrays.asList(Arrays.asList(0,100,0)))});
        } else {
            if ( modelType == AFCalcFactory.Calculation.EXACT_GENERAL_PLOIDY ) continue; // TODO -- GENERAL_PLOIDY DOESN'T WORK

            // test all combinations of PLs for 1 sample
            for ( final List<List<Integer>> PLsPerSample : Utils.makePermutations(bigDiploidPLs, 1, true) ) {
                tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)});
            }

            // smaller PL vocabulary for the 5-sample combinations, to keep the count sane
            final List<List<Integer>> smallDiploidPLs = new LinkedList<List<Integer>>();
            for ( final int nonRefPL : Arrays.asList(5, 10, 20, 30) ) {
                for ( int i = 0; i < 2; i++ ) {
                    List<Integer> pls = new ArrayList<Integer>(Collections.nCopies(3, nonRefPL));
                    pls.set(i, 0);
                    smallDiploidPLs.add(pls);
                }
            }

            for ( final List<List<Integer>> PLsPerSample : Utils.makePermutations(smallDiploidPLs, 5, false) ) {
                tests.add(new Object[]{modelType, toGenotypes(PLsPerSample)});
            }
        }
    }

    return tests.toArray(new Object[][]{});
}
/**
 * Keeps only the PL vectors that are monotonically non-increasing; any vector in
 * which a value rises above the running minimum so far is dropped.
 *
 * @param listOfPLs candidate PL vectors (each must be non-empty)
 * @return the surviving vectors, in their original order
 */
final List<List<Integer>> removeBadPLs(List<List<Integer>> listOfPLs) {
    final List<List<Integer>> keep = new LinkedList<List<Integer>>();

    for ( final List<Integer> pls : listOfPLs ) {
        int runningMin = pls.get(0);
        boolean nonIncreasing = true;
        for ( final int pl : pls ) {
            if ( pl > runningMin ) {
                nonIncreasing = false;
                break;
            }
            runningMin = pl;
        }
        if ( nonIncreasing )
            keep.add(pls);
    }

    return keep;
}
/**
 * Converts lists of PL triples into no-call diploid genotypes, normalizing each
 * triple so its smallest PL is 0 (the conventional PL normalization).
 */
private List<Genotype> toGenotypes(final List<List<Integer>> PLsPerSample) {
    final List<Allele> nocall = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL);
    final List<Genotype> genotypes = new ArrayList<Genotype>(PLsPerSample.size());

    for ( final List<Integer> PLs : PLsPerSample ) {
        final int[] pls = ArrayUtils.toPrimitive(PLs.toArray(new Integer[3]));
        // shift so the best (smallest) PL becomes 0
        final int min = MathUtils.arrayMin(pls);
        for ( int i = 0; i < pls.length; i++ ) pls[i] -= min;
        genotypes.add(makePL(nocall, pls));
    }

    return genotypes;
}
/**
 * Compares each model's posterior P(AF > 0) and MLE ACs against the reference
 * model's answer on the same genotypes, to tight (1e-3) tolerance.
 */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "PNonRefBiallelicSystematic")
private void PNonRefBiallelicSystematic(AFCalcFactory.Calculation modelType, final List<Genotype> genotypes) {
    //logger.warn("Running " + modelType + " with " + genotypes);
    final AFCalcTestBuilder refBuilder = new AFCalcTestBuilder(genotypes.size(), 1, AFCalcFactory.Calculation.EXACT_REFERENCE, AFCalcTestBuilder.PriorType.human);
    final AFCalcTestBuilder testBuilder = new AFCalcTestBuilder(genotypes.size(), 1, modelType, AFCalcTestBuilder.PriorType.human);

    final VariantContextBuilder vcb = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A, C));
    vcb.genotypes(genotypes);

    // note: both runs use the test builder's priors so only the model differs
    final AFCalcResult refResult = refBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());
    final AFCalcResult testResult = testBuilder.makeModel().getLog10PNonRef(vcb.make(), testBuilder.makePriors());

    final double tolerance = 1e-3;
    Assert.assertEquals(testResult.getLog10PosteriorOfAFGT0(), refResult.getLog10PosteriorOfAFGT0(), tolerance,
            "Actual pNonRef not within tolerance " + tolerance + " of expected");
    Assert.assertEquals(testResult.getAlleleCountsOfMLE(), refResult.getAlleleCountsOfMLE(),
            "Actual MLE " + Utils.join(",", testResult.getAlleleCountsOfMLE()) + " not equal to expected " + Utils.join(",", refResult.getAlleleCountsOfMLE()));
}
// --------------------------------------------------------------------------------
//
// Test priors
//
// --------------------------------------------------------------------------------
/** One test argument per model that supports 2 samples with up to 4 alt alleles. */
@DataProvider(name = "Models")
public Object[][] makeModels() {
    List<Object[]> tests = new ArrayList<Object[]>();

    for ( final AFCalcFactory.Calculation calc : AFCalcFactory.Calculation.values() ) {
        if ( calc.usableForParams(2, 4) )
            tests.add(new Object[]{AFCalcFactory.createAFCalc(calc, 2, 4)});
    }

    return tests.toArray(new Object[][]{});
}
/**
 * Sweeps the non-ref prior against a het genotype and checks that the model's
 * posterior matches a hand-computed ref-vs-het posterior, and that the MLE AC is
 * independent of the prior.
 */
@Test(enabled = true && !DEBUG_ONLY, dataProvider = "Models")
public void testBiallelicPriors(final AFCalc model) {

    for ( int REF_PL = 10; REF_PL <= 20; REF_PL += 10 ) {
        final Genotype AB = makePL(Arrays.asList(A,C), REF_PL, 0, 10000);

        for ( int log10NonRefPrior = 1; log10NonRefPrior < 10*REF_PL; log10NonRefPrior += 1 ) {
            // NOTE(review): despite its name, log10NonRefPrior is fed to
            // qualToErrorProb, which takes a phred-scaled quality -- confirm the
            // intended scale before renaming or refactoring.
            final double refPrior = 1 - QualityUtils.qualToErrorProb(log10NonRefPrior);
            final double nonRefPrior = (1-refPrior) / 2;
            final double[] priors = MathUtils.normalizeFromLog10(MathUtils.toLog10(new double[]{refPrior, nonRefPrior, nonRefPrior}), true);
            if ( ! Double.isInfinite(priors[1]) ) {
                GetGLsTest cfg = new GetGLsTest(model, 1, Arrays.asList(AB), priors, "pNonRef" + log10NonRefPrior);
                final AFCalcResult resultTracker = cfg.execute();
                final int actualAC = resultTracker.getAlleleCountsOfMLE()[0];

                // hand-compute the ref-vs-het posterior from the genotype likelihoods
                // and priors (the 0.5 corrects for the het configurations)
                final double pRefWithPrior = AB.getLikelihoods().getAsVector()[0] + priors[0];
                final double pHetWithPrior = AB.getLikelihoods().getAsVector()[1] + priors[1] - Math.log10(0.5);
                final double nonRefPost = Math.pow(10, pHetWithPrior) / (Math.pow(10, pRefWithPrior) + Math.pow(10, pHetWithPrior));
                final double log10NonRefPost = Math.log10(nonRefPost);

                if ( ! Double.isInfinite(log10NonRefPost) )
                    Assert.assertEquals(resultTracker.getLog10PosteriorOfAFGT0(), log10NonRefPost, 1e-2);

                if ( nonRefPost >= 0.9 )
                    Assert.assertTrue(resultTracker.isPolymorphic(C, -1));

                final int expectedMLEAC = 1; // the MLE is independent of the prior
                Assert.assertEquals(actualAC, expectedMLEAC,
                        "actual AC with priors " + log10NonRefPrior + " not expected "
                                + expectedMLEAC + " priors " + Utils.join(",", priors));
            }
        }
    }
}
// NOTE: a dangling @Test(enabled = true && ! DEBUG_ONLY, dataProvider = "Models")
// annotation used to sit here (its test method was evidently deleted in a merge).
// It attached to the zero-argument data provider below, so TestNG would have tried
// to run makePolyTestProvider as a test fed by the incompatible "Models" provider.
// The stray annotation has been removed.

// --------------------------------------------------------------------------------
//
// Test that polymorphic sites (bi and tri) are properly called
//
// --------------------------------------------------------------------------------

/**
 * Provider of systematic polymorphism-calling cases: for each high-quality model,
 * sample count, and per-allele AC, decides from the non-type PLs whether each alt
 * allele should be confidently called polymorphic.
 */
@DataProvider(name = "polyTestProvider")
public Object[][] makePolyTestProvider() {
    List<Object[]> tests = new ArrayList<Object[]>();

    // list of all high-quality models in the system
    final List<AFCalcFactory.Calculation> models = Arrays.asList(
            AFCalcFactory.Calculation.getDefaultModel(),
            AFCalcFactory.Calculation.EXACT_REFERENCE,
            AFCalcFactory.Calculation.EXACT_INDEPENDENT);

    // note that we cannot use small PLs here or the thresholds are hard to set
    for ( final int nonTypePLs : Arrays.asList(100, 1000) ) {
        for ( final AFCalcFactory.Calculation model : models ) {
            for ( final int allele1AC : Arrays.asList(0, 1, 2, 10, 100, 1000, 10000) ) {
                for ( final int nSamples : Arrays.asList(1, 10, 100, 1000, 10000) ) {
                    if ( nSamples < allele1AC ) continue;

                    // expected false-positive allele frequency implied by the PLs;
                    // an allele is expected polymorphic only when its AC exceeds the
                    // error rate and carries enough total confidence
                    final double pPerSample = Math.pow(10, nonTypePLs / -10.0);
                    final double errorFreq = pPerSample * nSamples;
                    final boolean poly1 = allele1AC > errorFreq && (nonTypePLs * allele1AC) > 30;

                    // bi-allelic tests
                    {
                        final AFCalcTestBuilder testBuilder
                                = new AFCalcTestBuilder(nSamples, 1, model, AFCalcTestBuilder.PriorType.human);
                        final List<Integer> ACs = Arrays.asList(allele1AC);
                        tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1)});
                    }

                    // multi-allelic tests (kept small to bound runtime)
                    for ( final int allele2AC : Arrays.asList(0, 1, 2, 10, 20, 50) ) {
                        if ( nSamples < allele2AC || allele1AC + allele2AC > nSamples || nSamples > 100 || nSamples == 1)
                            continue;

                        final AFCalcTestBuilder testBuilder
                                = new AFCalcTestBuilder(nSamples, 2, model, AFCalcTestBuilder.PriorType.human);

                        final List<Integer> ACs = Arrays.asList(allele1AC, allele2AC);
                        final boolean poly2 = allele2AC > errorFreq && (nonTypePLs * allele2AC) > 90;
                        tests.add(new Object[]{testBuilder, ACs, nonTypePLs, Arrays.asList(poly1, poly2)});
                    }
                }
            }
        }
    }

    return tests.toArray(new Object[][]{});
}
/** Runs the shared polymorphism-calling check over the general provider's cases. */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "polyTestProvider")
public void testCallingGeneral(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly ) {
    testCalling(testBuilder, ACs, nonTypePL, expectedPoly);
}
/**
 * Provider of 4-alt-allele calling cases for the independent-alleles model: every
 * combination of the candidate ACs, with each allele expected polymorphic iff its
 * AC is positive.
 */
@DataProvider(name = "polyTestProviderLotsOfAlleles")
public Object[][] makepolyTestProviderLotsOfAlleles() {
    List<Object[]> tests = new ArrayList<Object[]>();

    // list of all high-quality models in the system
    final List<AFCalcFactory.Calculation> models = Arrays.asList(AFCalcFactory.Calculation.EXACT_INDEPENDENT);

    final List<Integer> alleleCounts = Arrays.asList(0, 1, 2, 3, 4, 5, 10, 20);
    final int nonTypePLs = 1000;
    final int nAlleles = 4;
    for ( final AFCalcFactory.Calculation model : models ) {
        for ( final List<Integer> ACs : Utils.makePermutations(alleleCounts, nAlleles, true) ) {
            // with PLs this large, any positive AC should be called polymorphic
            final List<Boolean> isPoly = new ArrayList<Boolean>(ACs.size());
            for ( final int ac : ACs ) isPoly.add(ac > 0);

            final double acSum = MathUtils.sum(ACs);
            for ( final int nSamples : Arrays.asList(1, 10, 100) ) {
                // need enough chromosomes to carry the requested allele counts
                if ( nSamples < acSum ) continue;
                final AFCalcTestBuilder testBuilder
                        = new AFCalcTestBuilder(nSamples, nAlleles, model, AFCalcTestBuilder.PriorType.human);

                tests.add(new Object[]{testBuilder, ACs, nonTypePLs, isPoly});
            }
        }
    }

    return tests.toArray(new Object[][]{});
}
/** Runs the shared polymorphism-calling check over the 4-allele provider's cases. */
@Test(enabled = true && ! DEBUG_ONLY, dataProvider = "polyTestProviderLotsOfAlleles")
public void testCallingLotsOfAlleles(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly ) {
    testCalling(testBuilder, ACs, nonTypePL, expectedPoly);
}
/**
 * Shared check: builds a VC with the requested allele counts, runs the model, and
 * verifies the MLE ACs and per-allele polymorphism calls against expectations.
 */
private void testCalling(final AFCalcTestBuilder testBuilder, final List<Integer> ACs, final int nonTypePL, final List<Boolean> expectedPoly) {
    final AFCalc calc = testBuilder.makeModel();
    final double[] priors = testBuilder.makePriors();
    final VariantContext vc = testBuilder.makeACTest(ACs, 0, nonTypePL);
    final AFCalcResult result = calc.getLog10PNonRef(vc, priors);

    boolean anyPoly = false;
    for ( final boolean onePoly : expectedPoly ) anyPoly = anyPoly || onePoly;

    // if any allele should be polymorphic, the site-level posterior must not be tiny
    if ( anyPoly )
        Assert.assertTrue(result.getLog10PosteriorOfAFGT0() > -1);

    for ( int altI = 1; altI < result.getAllelesUsedInGenotyping().size(); altI++ ) {
        final int i = altI - 1;
        final Allele alt = result.getAllelesUsedInGenotyping().get(altI);

        // must be getCalledChrCount because we cannot ensure that the VC made has our desired ACs
        Assert.assertEquals(result.getAlleleCountAtMLE(alt), vc.getCalledChrCount(alt));
        Assert.assertEquals(result.isPolymorphic(alt, -1), (boolean)expectedPoly.get(i), "isPolymorphic for allele " + alt + " " + result.getLog10PosteriorOfAFGt0ForAllele(alt));
    }
}
}

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.walkers.genotyper; package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.gatk.walkers.genotyper.GeneralPloidyGenotypeLikelihoods;
import org.broadinstitute.sting.utils.variantcontext.Allele; import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype; import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder;
@ -136,18 +137,15 @@ public class GeneralPloidyAFCalculationModelUnitTest extends BaseTest {
@Test(dataProvider = "getGLs") @Test(dataProvider = "getGLs")
public void testGLs(GetGLsTest cfg) { public void testGLs(GetGLsTest cfg) {
final AlleleFrequencyCalculationResult result = new AlleleFrequencyCalculationResult(cfg.numAltAlleles);
final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size()); final int len = GeneralPloidyGenotypeLikelihoods.getNumLikelihoodElements(1 + cfg.numAltAlleles, cfg.ploidy * cfg.GLs.size());
double[] priors = new double[len]; // flat priors double[] priors = new double[len]; // flat priors
GeneralPloidyExactAFCalculationModel.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors, result); final GeneralPloidyExactAFCalc calc = new GeneralPloidyExactAFCalc(cfg.GLs.size(), 1 + cfg.numAltAlleles, cfg.ploidy);
calc.combineSinglePools(cfg.GLs, 1 + cfg.numAltAlleles, cfg.ploidy, priors);
int nameIndex = 1; int nameIndex = 1;
for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) { for ( int allele = 0; allele < cfg.numAltAlleles; allele++, nameIndex+=2 ) {
int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1)); int expectedAlleleCount = Integer.valueOf(cfg.name.substring(nameIndex, nameIndex+1));
int calculatedAlleleCount = result.getAlleleCountsOfMAP()[allele]; int calculatedAlleleCount = calc.getStateTracker().getAlleleCountsOfMAP()[allele];
// System.out.format( "%s Expected:%d Calc:%d\n",cfg.toString(),expectedAlleleCount, calculatedAlleleCount);
Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount); Assert.assertEquals(calculatedAlleleCount, expectedAlleleCount);
} }
} }

View File

@ -0,0 +1,176 @@
package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc;
import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.variantcontext.Allele;
import org.broadinstitute.sting.utils.variantcontext.Genotype;
import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import org.broadinstitute.sting.utils.variantcontext.VariantContextBuilder;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.*;
// SEE private/R/pls.R if you want the truth output for these tests
public class IndependentAllelesDiploidExactAFCalcUnitTest extends BaseTest {
/**
 * Provider for combineGLs: each case is {1-based alt index to keep, number of alts,
 * input genotype PLs, expected bi-allelic combined PLs}.  Expected values were
 * computed externally (see the pls.R note at the top of this class).
 */
@DataProvider(name = "TestCombineGLs")
public Object[][] makeTestCombineGLs() {
    List<Object[]> tests = new ArrayList<Object[]>();

    // single alt: combining is the identity
    tests.add(new Object[]{1, 1, makePL( 0, 10, 20), makePL( 0, 10, 20)});
    tests.add(new Object[]{1, 1, makePL(10, 0, 20), makePL(10, 0, 20)});
    tests.add(new Object[]{1, 1, makePL(20, 10, 0), makePL(20, 10, 0)});

    // AA AB BB AC BC CC => AA AB+BC CC
    tests.add(new Object[]{1, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 10, 20)});
    tests.add(new Object[]{2, 2, makePL( 0, 10, 20, 30, 40, 50), makePL(0, 30, 50)});
    tests.add(new Object[]{1, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)});
    tests.add(new Object[]{2, 2, makePL( 0, 10, 10, 10, 10, 10), makePL(0, 8, 11)});
    tests.add(new Object[]{1, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 2, 5)});
    tests.add(new Object[]{2, 2, makePL( 0, 1, 2, 3, 4, 5), makePL(0, 4, 9)});
    tests.add(new Object[]{1, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)});
    tests.add(new Object[]{2, 2, makePL( 0, 50, 50, 50, 50, 50), makePL( 0, 47, 50)});
    tests.add(new Object[]{1, 2, makePL( 50, 0, 50, 50, 50, 50), makePL(45, 0, 50)});
    tests.add(new Object[]{2, 2, makePL( 50, 0, 50, 50, 50, 50), makePL( 0, 47, 50)});
    tests.add(new Object[]{1, 2, makePL( 50, 50, 0, 50, 50, 50), makePL(45, 47, 0)});
    tests.add(new Object[]{2, 2, makePL( 50, 50, 0, 50, 50, 50), makePL( 0, 47, 50)});
    tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(0, 47, 50)});
    tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 0, 50, 50), makePL(45, 0, 50)});
    tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)});
    tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 0, 50), makePL(45, 0, 50)});
    tests.add(new Object[]{1, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(0, 47, 50)});
    tests.add(new Object[]{2, 2, makePL( 50, 50, 50, 50, 50, 0), makePL(45, 47, 0)});

    return tests.toArray(new Object[][]{});
}
private Genotype makePL(final int ... PLs) {
return AFCalcUnitTest.makePL(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL), PLs);
}
@Test(enabled = true, dataProvider = "TestCombineGLs")
private void testCombineGLs(final int altIndex, final int nAlts, final Genotype testg, final Genotype expected) {
final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
final Genotype combined = calc.combineGLs(testg, altIndex, nAlts);
Assert.assertEquals(combined.getPL(), expected.getPL(),
"Combined PLs " + Utils.join(",", combined.getPL()) + " != expected " + Utils.join(",", expected.getPL()));
}
static Allele A = Allele.create("A", true);
static Allele C = Allele.create("C");
static Allele G = Allele.create("G");
@DataProvider(name = "TestMakeAlleleConditionalContexts")
public Object[][] makeTestMakeAlleleConditionalContexts() {
List<Object[]> tests = new ArrayList<Object[]>();
final VariantContextBuilder root = new VariantContextBuilder("x", "1", 1, 1, Arrays.asList(A));
final VariantContextBuilder vcAC = new VariantContextBuilder(root).alleles(Arrays.asList(A, C));
final VariantContextBuilder vcAG = new VariantContextBuilder(root).alleles(Arrays.asList(A, G));
final VariantContextBuilder vcACG = new VariantContextBuilder(root).alleles(Arrays.asList(A, C, G));
final VariantContextBuilder vcAGC = new VariantContextBuilder(root).alleles(Arrays.asList(A, G, C));
final Genotype gACG = makePL( 0, 1, 2, 3, 4, 5);
final Genotype gAGC = makePL( 0, 4, 5, 1, 3, 2);
final Genotype gACcombined = makePL(0, 2, 5);
final Genotype gACcombined2 = makePL(0, 1, 4);
final Genotype gAGcombined = makePL(0, 4, 9);
// biallelic
tests.add(new Object[]{vcAC.genotypes(gACcombined).make(), Arrays.asList(vcAC.genotypes(gACcombined).make())});
// tri-allelic
tests.add(new Object[]{vcACG.genotypes(gACG).make(), Arrays.asList(vcAC.genotypes(gACcombined).make(), vcAG.genotypes(gAGcombined).make())});
tests.add(new Object[]{vcAGC.genotypes(gAGC).make(), Arrays.asList(vcAG.genotypes(gAGcombined).make(), vcAC.genotypes(gACcombined2).make())});
return tests.toArray(new Object[][]{});
}
@Test(enabled = true, dataProvider = "TestMakeAlleleConditionalContexts")
private void testMakeAlleleConditionalContexts(final VariantContext vc, final List<VariantContext> expectedVCs) {
final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 4);
final List<VariantContext> biAllelicVCs = calc.makeAlleleConditionalContexts(vc);
Assert.assertEquals(biAllelicVCs.size(), expectedVCs.size());
for ( int i = 0; i < biAllelicVCs.size(); i++ ) {
final VariantContext actual = biAllelicVCs.get(i);
final VariantContext expected = expectedVCs.get(i);
Assert.assertEquals(actual.getAlleles(), expected.getAlleles());
for ( int j = 0; j < actual.getNSamples(); j++ )
Assert.assertEquals(actual.getGenotype(j).getPL(), expected.getGenotype(j).getPL(),
"expected PLs " + Utils.join(",", expected.getGenotype(j).getPL()) + " not equal to actual " + Utils.join(",", actual.getGenotype(j).getPL()));
}
}
@DataProvider(name = "ThetaNTests")
public Object[][] makeThetaNTests() {
List<Object[]> tests = new ArrayList<Object[]>();
final List<Double> log10LAlleles = Arrays.asList(0.0, -1.0, -2.0, -3.0, -4.0);
for ( final double log10pRef : Arrays.asList(-1, -2, -3) ) {
for ( final int ploidy : Arrays.asList(1, 2, 3, 4) ) {
for ( List<Double> permutations : Utils.makePermutations(log10LAlleles, ploidy, true)) {
tests.add(new Object[]{permutations, Math.pow(10, log10pRef)});
}
}
}
return tests.toArray(new Object[][]{});
}
@Test(dataProvider = "ThetaNTests")
public void testThetaNTests(final List<Double> log10LAlleles, final double pRef) {
// biallelic
final double[] rawPriors = MathUtils.toLog10(new double[]{pRef, 1-pRef});
final double log10pNonRef = Math.log10(1-pRef);
final List<AFCalcResult> originalPriors = new LinkedList<AFCalcResult>();
final List<Double> pNonRefN = new LinkedList<Double>();
for ( int i = 0; i < log10LAlleles.size(); i++ ) {
final double log10LAllele1 = log10LAlleles.get(i);
final double[] L1 = MathUtils.normalizeFromLog10(new double[]{log10LAllele1, 0.0}, true);
final AFCalcResult result1 = new AFCalcResult(new int[]{1}, 1, Arrays.asList(A, C), L1, rawPriors, Collections.singletonMap(C, 0.0));
originalPriors.add(result1);
pNonRefN.add(log10pNonRef*(i+1));
}
final IndependentAllelesDiploidExactAFCalc calc = (IndependentAllelesDiploidExactAFCalc)AFCalcFactory.createAFCalc(AFCalcFactory.Calculation.EXACT_INDEPENDENT, 1, 2);
final List<AFCalcResult> thetaNPriors = calc.applyMultiAllelicPriors(originalPriors);
double prevPosterior = 0.0;
for ( int i = 0; i < log10LAlleles.size(); i++ ) {
final AFCalcResult thetaN = thetaNPriors.get(i);
AFCalcResult orig = null;
for ( final AFCalcResult x : originalPriors )
if ( x.getAllelesUsedInGenotyping().equals(thetaN.getAllelesUsedInGenotyping()))
orig = x;
Assert.assertNotNull(orig, "couldn't find original AFCalc");
Assert.assertEquals(orig.getLog10PriorOfAFGT0(), log10pNonRef, 1e-6);
Assert.assertEquals(thetaN.getLog10PriorOfAFGT0(), pNonRefN.get(i), 1e-6);
Assert.assertTrue(orig.getLog10PosteriorOfAFGT0() <= prevPosterior, "AFCalc results should be sorted but " + prevPosterior + " is > original posterior " + orig.getLog10PosteriorOfAFGT0());
prevPosterior = orig.getLog10PosteriorOfAFGT0();
}
}
}

View File

@ -8,9 +8,10 @@ import java.util.Arrays;
public class HaplotypeCallerIntegrationTest extends WalkerTest { public class HaplotypeCallerIntegrationTest extends WalkerTest {
final static String REF = b37KGReference; final static String REF = b37KGReference;
final String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam"; final String NA12878_BAM = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.bam";
final String NA12878_CHR20_BAM = validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam";
final String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam"; final String CEUTRIO_BAM = validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam";
final String NA12878_RECALIBRATED_BAM = privateTestDir + "NA12878.100kb.BQSRv2.example.bam";
final String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals"; final String INTERVALS_FILE = validationDataLocation + "NA12878.HiSeq.b37.chr20.10_11mb.test.intervals";
//final String RECAL_FILE = validationDataLocation + "NA12878.kmer.8.subset.recal_data.bqsr";
private void HCTest(String bam, String args, String md5) { private void HCTest(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3"; final String base = String.format("-T HaplotypeCaller -R %s -I %s -L %s", REF, bam, INTERVALS_FILE) + " --no_cmdline_in_header -o %s -minPruning 3";
@ -20,28 +21,77 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest {
@Test @Test
public void testHaplotypeCallerMultiSample() { public void testHaplotypeCallerMultiSample() {
HCTest(CEUTRIO_BAM, "", "6b30c7e1b6bbe80d180d9d67441cec12"); HCTest(CEUTRIO_BAM, "", "aa1df35d6e64d7ca93feb4d2dd15dd0e");
} }
@Test @Test
public void testHaplotypeCallerSingleSample() { public void testHaplotypeCallerSingleSample() {
HCTest(NA12878_BAM, "", "4cdfbfeadef00725974828310558d7d4"); HCTest(NA12878_BAM, "", "186c7f322978283c01249c6de2829215");
} }
@Test @Test
public void testHaplotypeCallerMultiSampleGGA() { public void testHaplotypeCallerMultiSampleGGA() {
HCTest(CEUTRIO_BAM, "-gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "6183fb6e374976d7087150009685e043"); HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "de9e78a52207fe62144dba5337965469");
} }
private void HCTestComplexVariants(String bam, String args, String md5) { private void HCTestComplexVariants(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 3"; final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10028767-10028967 -L 20:10431524-10431924 -L 20:10723661-10724061 -L 20:10903555-10903955 --no_cmdline_in_header -o %s -minPruning 2";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5)); final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec); executeTest("testHaplotypeCallerComplexVariants: args=" + args, spec);
} }
@Test @Test
public void testHaplotypeCallerMultiSampleComplex() { public void testHaplotypeCallerMultiSampleComplex() {
HCTestComplexVariants(CEUTRIO_BAM, "", "ab7593a7a60a2e9a66053572f1718df1"); HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "000dbb1b48f94d017cfec127c6cabe8f");
}
private void HCTestSymbolicVariants(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:5947969-5948369 -L 20:61091236-61091636 --no_cmdline_in_header -o %s -minPruning 2";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerSymbolicVariants: args=" + args, spec);
}
@Test
public void testHaplotypeCallerSingleSampleSymbolic() {
HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "16013a9203367c3d1c4ce1dcdc81ef4a");
}
private void HCTestIndelQualityScores(String bam, String args, String md5) {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, bam) + " -L 20:10,005,000-10,025,000 --no_cmdline_in_header -o %s -minPruning 2";
final WalkerTestSpec spec = new WalkerTestSpec(base + " " + args, Arrays.asList(md5));
executeTest("testHaplotypeCallerIndelQualityScores: args=" + args, spec);
}
@Test
public void testHaplotypeCallerSingleSampleIndelQualityScores() {
HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "b369c2a6cb5c99a424551b33bae16f3b");
}
@Test
public void HCTestProblematicReadsModifiedInActiveRegions() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c306140ad28515ee06c603c225217939"));
executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec);
}
@Test
public void HCTestStructuralIndels() {
final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730";
final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6c67ee8e99cc8f53a6587bb26028047"));
executeTest("HCTestStructuralIndels: ", spec);
}
// --------------------------------------------------------------------------------------------------------------
//
// testing reduced reads
//
// --------------------------------------------------------------------------------------------------------------
@Test
public void HCTestReducedBam() {
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec(
"-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1,
Arrays.asList("4beb9f87ab3f316a9384c3d0dca6ebe9"));
executeTest("HC calling on a ReducedRead BAM", spec);
} }
} }

View File

@ -23,24 +23,26 @@
*/ */
// our package // our package
package org.broadinstitute.sting.utils; package org.broadinstitute.sting.utils.pairhmm;
// the imports for unit testing. // the imports for unit testing.
import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.BaseTest;
import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.Utils;
import org.testng.Assert; import org.testng.Assert;
import org.testng.annotations.DataProvider; import org.testng.annotations.DataProvider;
import org.testng.annotations.Test; import org.testng.annotations.Test;
import java.util.*; import java.util.*;
public class PairHMMUnitTest extends BaseTest { public class PairHMMUnitTest extends BaseTest {
final static boolean EXTENSIVE_TESTING = true; final static boolean EXTENSIVE_TESTING = true;
PairHMM hmm = new PairHMM( false ); // reference implementation PairHMM exactHMM = new ExactPairHMM(); // the log truth implementation
PairHMM bandedHMM = new PairHMM( true ); // algorithm with banding PairHMM originalHMM = new OriginalPairHMM(); // the reference implementation
PairHMM cachingHMM = new CachingPairHMM();
PairHMM loglessHMM = new LoglessCachingPairHMM();
// -------------------------------------------------------------------------------- // --------------------------------------------------------------------------------
// //
@ -57,7 +59,7 @@ public class PairHMMUnitTest extends BaseTest {
final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC"; final static String LEFT_FLANK = "GATTTATCATCGAGTCTGC";
final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA"; final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTTA";
public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) { public BasicLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp ) {
this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false); this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
} }
@ -76,115 +78,51 @@ public class PairHMMUnitTest extends BaseTest {
} }
public double expectedLogL() { public double expectedLogL() {
return expectedQual / -10.0; return (expectedQual / -10.0) + 0.03 ;
} }
public double tolerance() { public double toleranceFromTheoretical() {
return 0.1; // TODO FIXME arbitrary return 0.2;
} }
public double calcLogL() { public double toleranceFromReference() {
return 1E-4;
}
double logL = hmm.computeReadLikelihoodGivenHaplotype( public double toleranceFromExact() {
return 1E-9;
}
public double calcLogL( final PairHMM pairHMM, boolean anchorIndel ) {
pairHMM.initialize(readBasesWithContext.length, refBasesWithContext.length);
return pairHMM.computeReadLikelihoodGivenHaplotypeLog10(
refBasesWithContext, readBasesWithContext, refBasesWithContext, readBasesWithContext,
qualAsBytes(baseQual, false), qualAsBytes(insQual, true), qualAsBytes(delQual, true), qualAsBytes(baseQual, false, anchorIndel), qualAsBytes(insQual, true, anchorIndel), qualAsBytes(delQual, true, anchorIndel),
qualAsBytes(gcp, false)); qualAsBytes(gcp, false, anchorIndel), 0, true);
return logL;
} }
private final byte[] asBytes(final String bases, final boolean left, final boolean right) { private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes(); return ( (left ? LEFT_FLANK : "") + CONTEXT + bases + CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
} }
private byte[] qualAsBytes(final int phredQual, final boolean doGOP) { private byte[] qualAsBytes(final int phredQual, final boolean doGOP, final boolean anchorIndel) {
final byte phredQuals[] = new byte[readBasesWithContext.length]; final byte phredQuals[] = new byte[readBasesWithContext.length];
// initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
Arrays.fill(phredQuals, (byte)100);
// update just the bases corresponding to the provided micro read with the quality scores if( anchorIndel ) {
if( doGOP ) { // initialize everything to MASSIVE_QUAL so it cannot be moved by HMM
phredQuals[0 + CONTEXT.length()] = (byte)phredQual; Arrays.fill(phredQuals, (byte)100);
} else {
for ( int i = 0; i < read.length(); i++)
phredQuals[i + CONTEXT.length()] = (byte)phredQual;
}
return phredQuals; // update just the bases corresponding to the provided micro read with the quality scores
} if( doGOP ) {
} phredQuals[0 + CONTEXT.length()] = (byte)phredQual;
} else {
final Random random = new Random(87865573); for ( int i = 0; i < read.length(); i++)
private class BandedLikelihoodTestProvider extends TestDataProvider { phredQuals[i + CONTEXT.length()] = (byte)phredQual;
final String ref, read;
final byte[] refBasesWithContext, readBasesWithContext;
final int baseQual, insQual, delQual, gcp;
final int expectedQual;
final static String LEFT_CONTEXT = "ACGTAATGACGCTACATGTCGCCAACCGTC";
final static String RIGHT_CONTEXT = "TACGGCTTCATATAGGGCAATGTGTGTGGCAAAA";
final static String LEFT_FLANK = "GATTTATCATCGAGTCTGTT";
final static String RIGHT_FLANK = "CATGGATCGTTATCAGCTATCTCGAGGGATTCACTTAACAGTTTCCGTA";
final byte[] baseQuals, insQuals, delQuals, gcps;
public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp) {
this(ref, read, baseQual, insQual, delQual, expectedQual, gcp, false, false);
}
public BandedLikelihoodTestProvider(final String ref, final String read, final int baseQual, final int insQual, final int delQual, final int expectedQual, final int gcp, final boolean left, final boolean right) {
super(BandedLikelihoodTestProvider.class, String.format("BANDED: ref=%s read=%s b/i/d/c quals = %d/%d/%d/%d l/r flank = %b/%b e[qual]=%d", ref, read, baseQual, insQual, delQual, gcp, left, right, expectedQual));
this.baseQual = baseQual;
this.delQual = delQual;
this.insQual = insQual;
this.gcp = gcp;
this.read = read;
this.ref = ref;
this.expectedQual = expectedQual;
refBasesWithContext = asBytes(ref, left, right);
readBasesWithContext = asBytes(read, false, false);
baseQuals = qualAsBytes(baseQual);
insQuals = qualAsBytes(insQual);
delQuals = qualAsBytes(delQual);
gcps = qualAsBytes(gcp, false);
}
public double expectedLogL() {
double logL = hmm.computeReadLikelihoodGivenHaplotype(
refBasesWithContext, readBasesWithContext,
baseQuals, insQuals, delQuals, gcps);
return logL;
}
public double tolerance() {
return 0.2; // TODO FIXME arbitrary
}
public double calcLogL() {
double logL = bandedHMM.computeReadLikelihoodGivenHaplotype(
refBasesWithContext, readBasesWithContext,
baseQuals, insQuals, delQuals, gcps);
return logL;
}
private final byte[] asBytes(final String bases, final boolean left, final boolean right) {
return ( (left ? LEFT_FLANK : "") + LEFT_CONTEXT + bases + RIGHT_CONTEXT + (right ? RIGHT_FLANK : "")).getBytes();
}
private byte[] qualAsBytes(final int phredQual) {
return qualAsBytes(phredQual, true);
}
private byte[] qualAsBytes(final int phredQual, final boolean addRandom) {
final byte phredQuals[] = new byte[readBasesWithContext.length];
Arrays.fill(phredQuals, (byte)phredQual);
if(addRandom) {
for( int iii = 0; iii < phredQuals.length; iii++) {
phredQuals[iii] = (byte) ((int) phredQuals[iii] + (random.nextInt(7) - 3));
} }
} else {
Arrays.fill(phredQuals, (byte)phredQual);
} }
return phredQuals; return phredQuals;
} }
} }
@ -195,8 +133,8 @@ public class PairHMMUnitTest extends BaseTest {
// test all combinations // test all combinations
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30); final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30, 40, 50) : Arrays.asList(30);
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40); final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 30, 40, 50) : Arrays.asList(40);
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10); final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(8, 10, 20) : Arrays.asList(10);
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20,30,35) : Arrays.asList(2);
for ( final int baseQual : baseQuals ) { for ( final int baseQual : baseQuals ) {
for ( final int indelQual : indelQuals ) { for ( final int indelQual : indelQuals ) {
@ -219,7 +157,7 @@ public class PairHMMUnitTest extends BaseTest {
for ( boolean insertionP : Arrays.asList(true, false)) { for ( boolean insertionP : Arrays.asList(true, false)) {
final String small = Utils.dupString((char)base, 1); final String small = Utils.dupString((char)base, 1);
final String big = Utils.dupString((char)base, size); final String big = Utils.dupString((char) base, size);
final String ref = insertionP ? small : big; final String ref = insertionP ? small : big;
final String read = insertionP ? big : small; final String read = insertionP ? big : small;
@ -238,69 +176,65 @@ public class PairHMMUnitTest extends BaseTest {
return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class); return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
} }
@Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true) final Random random = new Random(87860573);
public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) { @DataProvider(name = "OptimizedLikelihoodTestProvider")
double calculatedLogL = cfg.calcLogL(); public Object[][] makeOptimizedLikelihoodTests() {
double expectedLogL = cfg.expectedLogL();
logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString()));
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance());
}
@DataProvider(name = "BandedLikelihoodTestProvider")
public Object[][] makeBandedLikelihoodTests() {
// context on either side is ACGTTGCA REF ACGTTGCA // context on either side is ACGTTGCA REF ACGTTGCA
// test all combinations // test all combinations
final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(25, 30, 40, 50) : Arrays.asList(30); final List<Integer> baseQuals = EXTENSIVE_TESTING ? Arrays.asList(10, 30, 40, 60) : Arrays.asList(30);
final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(30, 40, 50) : Arrays.asList(40); final List<Integer> indelQuals = EXTENSIVE_TESTING ? Arrays.asList(20, 40, 60) : Arrays.asList(40);
final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 12) : Arrays.asList(10); final List<Integer> gcps = EXTENSIVE_TESTING ? Arrays.asList(10, 20, 30) : Arrays.asList(10);
final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(2,3,4,5,7,8,9,10,20) : Arrays.asList(2); final List<Integer> sizes = EXTENSIVE_TESTING ? Arrays.asList(3, 20, 50, 90, 160) : Arrays.asList(2);
for ( final int baseQual : baseQuals ) { for ( final int baseQual : baseQuals ) {
for ( final int indelQual : indelQuals ) { for ( final int indelQual : indelQuals ) {
for ( final int gcp : gcps ) { for ( final int gcp : gcps ) {
for ( final int refSize : sizes ) {
// test substitutions for ( final int readSize : sizes ) {
for ( final byte refBase : BaseUtils.BASES ) { String ref = "";
for ( final byte readBase : BaseUtils.BASES ) { String read = "";
final String ref = new String(new byte[]{refBase}); for( int iii = 0; iii < refSize; iii++) {
final String read = new String(new byte[]{readBase}); ref += (char) BaseUtils.BASES[random.nextInt(4)];
final int expected = refBase == readBase ? 0 : baseQual;
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
}
}
// test insertions and deletions
for ( final int size : sizes ) {
for ( final byte base : BaseUtils.BASES ) {
final int expected = indelQual + (size - 2) * gcp;
for ( boolean insertionP : Arrays.asList(true, false)) {
final String small = Utils.dupString((char)base, 1);
final String big = Utils.dupString((char)base, size);
final String ref = insertionP ? small : big;
final String read = insertionP ? big : small;
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp);
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, false);
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, false, true);
new BandedLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, expected, gcp, true, true);
} }
for( int iii = 0; iii < readSize; iii++) {
read += (char) BaseUtils.BASES[random.nextInt(4)];
}
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp);
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, false);
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, false, true);
new BasicLikelihoodTestProvider(ref, read, baseQual, indelQual, indelQual, -0, gcp, true, true);
} }
} }
} }
} }
} }
return BandedLikelihoodTestProvider.getTests(BandedLikelihoodTestProvider.class); return BasicLikelihoodTestProvider.getTests(BasicLikelihoodTestProvider.class);
} }
@Test(dataProvider = "BandedLikelihoodTestProvider", enabled = true) @Test(dataProvider = "BasicLikelihoodTestProvider", enabled = true)
public void testBandedLikelihoods(BandedLikelihoodTestProvider cfg) { public void testBasicLikelihoods(BasicLikelihoodTestProvider cfg) {
double calculatedLogL = cfg.calcLogL(); double exactLogL = cfg.calcLogL( exactHMM, true );
double calculatedLogL = cfg.calcLogL( originalHMM, true );
double optimizedLogL = cfg.calcLogL( cachingHMM, true );
double loglessLogL = cfg.calcLogL( loglessHMM, true );
double expectedLogL = cfg.expectedLogL(); double expectedLogL = cfg.expectedLogL();
logger.warn(String.format("Test: logL calc=%.2f expected=%.2f for %s", calculatedLogL, expectedLogL, cfg.toString())); //logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString()));
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.tolerance()); Assert.assertEquals(exactLogL, expectedLogL, cfg.toleranceFromTheoretical());
Assert.assertEquals(calculatedLogL, expectedLogL, cfg.toleranceFromTheoretical());
Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference());
Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact());
}
@Test(dataProvider = "OptimizedLikelihoodTestProvider", enabled = true)
public void testOptimizedLikelihoods(BasicLikelihoodTestProvider cfg) {
double exactLogL = cfg.calcLogL( exactHMM, false );
double calculatedLogL = cfg.calcLogL( originalHMM, false );
double optimizedLogL = cfg.calcLogL( cachingHMM, false );
double loglessLogL = cfg.calcLogL( loglessHMM, false );
//logger.warn(String.format("Test: logL calc=%.2f optimized=%.2f logless=%.2f expected=%.2f for %s", calculatedLogL, optimizedLogL, loglessLogL, expectedLogL, cfg.toString()));
Assert.assertEquals(optimizedLogL, calculatedLogL, cfg.toleranceFromReference());
Assert.assertEquals(loglessLogL, exactLogL, cfg.toleranceFromExact());
} }
@Test @Test
@ -322,11 +256,11 @@ public class PairHMMUnitTest extends BaseTest {
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset); byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length-offset);
// change single base at position k to C. If it's a C, change to T // change single base at position k to C. If it's a C, change to T
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
double res1 = hmm.computeReadLikelihoodGivenHaplotype( originalHMM.initialize(mread.length, haplotype1.length);
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
haplotype1, mread, haplotype1, mread,
quals, gop, gop, quals, gop, gop,
gcp); gcp, 0, false);
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);
@ -353,11 +287,11 @@ public class PairHMMUnitTest extends BaseTest {
byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length); byte[] mread = Arrays.copyOfRange(haplotype1,offset,haplotype1.length);
// change single base at position k to C. If it's a C, change to T // change single base at position k to C. If it's a C, change to T
mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C'); mread[k] = ( mread[k] == (byte)'C' ? (byte)'T' : (byte)'C');
double res1 = hmm.computeReadLikelihoodGivenHaplotype( originalHMM.initialize(mread.length, haplotype1.length);
double res1 = originalHMM.computeReadLikelihoodGivenHaplotypeLog10(
haplotype1, mread, haplotype1, mread,
quals, gop, gop, quals, gop, gop,
gcp); gcp, 0, false);
System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1); System.out.format("H:%s\nR: %s\n Pos:%d Result:%4.2f\n",new String(haplotype1), new String(mread), k,res1);

View File

@ -111,7 +111,13 @@ gsa.read.gatkreportv1 <- function(lines) {
headerRowCount = -1; headerRowCount = -1;
finishTable <- function() { finishTable <- function() {
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, tableRows[1:rowCount,], tableEnv); if ( rowCount == 1 )
# good I hate R. Work around to avoid collapsing into an unstructured vector when
# there's only 1 row
sub <- t(as.matrix(tableRows[1:rowCount,]))
else
sub <- tableRows[1:rowCount,]
.gsa.assignGATKTableToEnvironment(tableName, tableHeader, sub, tableEnv);
} }
for (line in lines) { for (line in lines) {

View File

@ -1,5 +1,6 @@
library(gplots) library(gplots)
library(ggplot2) library(ggplot2)
library(tools)
# ------------------------------------------------------- # -------------------------------------------------------
# Utilities for displaying multiple plots per page # Utilities for displaying multiple plots per page
@ -59,6 +60,7 @@ closePDF <- function(outputPDF) {
if ( ! is.na(outputPDF) ) { if ( ! is.na(outputPDF) ) {
dev.off() dev.off()
if (exists("compactPDF")) { if (exists("compactPDF")) {
print("compacting PDF")
compactPDF(outputPDF) compactPDF(outputPDF)
} }
} }

View File

@ -245,7 +245,7 @@ public class FastaSequenceIndexBuilder {
* Reset iterators and add contig to sequence index * Reset iterators and add contig to sequence index
*/ */
private void finishReadingContig(FastaSequenceIndex sequenceIndex) { private void finishReadingContig(FastaSequenceIndex sequenceIndex) {
sequenceIndex.add(new FastaSequenceIndexEntry(contig, location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++)); sequenceIndex.add(new FastaSequenceIndexEntry(trimContigName(contig), location, size, (int) basesPerLine, (int) bytesPerLine, thisSequenceIndex++));
status = Status.NONE; status = Status.NONE;
contig = ""; contig = "";
size = 0; size = 0;
@ -258,6 +258,14 @@ public class FastaSequenceIndexBuilder {
} }
} }
/*
* Trims the contig name to the expected value by removing any characters after the first whitespace
*/
private static String trimContigName(final String contigName) {
int whitespaceIndex = contigName.indexOf(' ');
return ( whitespaceIndex == -1 ) ? contigName : contigName.substring(0, whitespaceIndex);
}
/** /**
* Stores FastaSequenceIndex as a .fasta.fai file on local machine * Stores FastaSequenceIndex as a .fasta.fai file on local machine
* Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder * Although method is public it cannot be called on any old FastaSequenceIndex - must be created by a FastaSequenceIndexBuilder

View File

@ -125,6 +125,37 @@ public class GATKBAMFileSpan extends BAMFileSpan {
return size; return size;
} }
/**
 * Builds a single GATKChunk covering the "extent" of this file span: it runs
 * from the start of the very first chunk through the end of the very last one.
 * The chunk list must be sorted in order to use this method.
 *
 * @return a GATKChunk spanning this file span end-to-end, or a GATKChunk of
 *         size 0 when the span holds no chunks at all
 */
public GATKChunk getExtent() {
    validateSorted(); // TODO: defensive measure: may be unnecessary

    final List<Chunk> chunkList = getChunks();
    if ( ! chunkList.isEmpty() ) {
        final long extentStart = chunkList.get(0).getChunkStart();
        final long extentEnd = chunkList.get(chunkList.size() - 1).getChunkEnd();
        return new GATKChunk(extentStart, extentEnd);
    }

    // Empty span: represent it as a zero-length chunk.
    return new GATKChunk(0L, 0L);
}
/**
 * Validates the list of chunks to ensure that they appear in sorted order,
 * failing fast with a ReviewedStingException on the first out-of-order pair.
 */
private void validateSorted() {
    Chunk previous = null;
    for ( final Chunk current : getChunks() ) {
        // Sorted means each chunk starts at or after the previous chunk's end.
        if ( previous != null && current.getChunkStart() < previous.getChunkEnd() ) {
            throw new ReviewedStingException(String.format("Chunk list is unsorted; chunk %s is before chunk %s", previous, current));
        }
        previous = current;
    }
}
/** /**
* Computes the union of two FileSpans. * Computes the union of two FileSpans.
* @param other FileSpan to union with this one. * @param other FileSpan to union with this one.

View File

@ -31,7 +31,7 @@ import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker; import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.BaseUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
@ -81,7 +81,7 @@ public class AlignmentValidation extends ReadWalker<Integer,Integer> {
* @return Number of reads aligned by this map (aka 1). * @return Number of reads aligned by this map (aka 1).
*/ */
@Override @Override
public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) { public Integer map(ReferenceContext ref, GATKSAMRecord read, RefMetaDataTracker metaDataTracker) {
//logger.info(String.format("examining read %s", read.getReadName())); //logger.info(String.format("examining read %s", read.getReadName()));
byte[] bases = read.getReadBases(); byte[] bases = read.getReadBases();

View File

@ -1,139 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.alignment;
import net.sf.picard.reference.ReferenceSequenceFileFactory;
import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.io.StingSAMFileWriter;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.gatk.walkers.WalkerName;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.File;
/**
 * Aligns reads to a given reference using Heng Li's BWA aligner, presenting the resulting alignments in SAM or BAM format.
 * Mimics the steps 'bwa aln' followed by 'bwa samse' using the BWA/C implementation.
 *
 * @author mhanna
 * @version 0.1
 */
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
@WalkerName("Align")
public class AlignmentWalker extends ReadWalker<Integer,Integer> {
    @Argument(fullName="target_reference",shortName="target_ref",doc="The reference to which reads in the source file should be aligned. Alongside this reference should sit index files " +
            "generated by bwa index -d bwtsw. If unspecified, will default " +
            "to the reference specified via the -R argument.",required=false)
    private File targetReferenceFile = null;

    // Destination writer for the aligned reads; injected by the command-line system.
    @Output
    private StingSAMFileWriter out = null;

    /**
     * The actual aligner.
     */
    private BWACAligner aligner = null;

    /**
     * New header to use, if desired.
     */
    private SAMFileHeader header;

    /**
     * Create an aligner object. The aligner object will load and hold the BWT until close() is called.
     */
    @Override
    public void initialize() {
        // Fall back to the engine's -R reference when no target reference was given.
        if(targetReferenceFile == null)
            targetReferenceFile = getToolkit().getArguments().referenceFile;

        BWTFiles bwtFiles = new BWTFiles(targetReferenceFile.getAbsolutePath());
        BWAConfiguration configuration = new BWAConfiguration();
        aligner = new BWACAligner(bwtFiles,configuration);

        // Take the header of the SAM file, tweak it by adding in the reference dictionary and specifying that the target file is unsorted.
        // clone() avoids mutating the toolkit's shared header instance.
        header = getToolkit().getSAMFileHeader().clone();
        SAMSequenceDictionary referenceDictionary =
                ReferenceSequenceFileFactory.getReferenceSequenceFile(targetReferenceFile).getSequenceDictionary();
        header.setSequenceDictionary(referenceDictionary);
        header.setSortOrder(SAMFileHeader.SortOrder.unsorted);

        out.writeHeader(header);
    }

    /**
     * Aligns a read to the given reference.
     *
     * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
     * @param read Read to align.
     * @return Number of alignments found for this read.
     */
    @Override
    public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
        // NOTE(review): despite the javadoc, this always returns 1 per read,
        // regardless of how many placements BWA found — TODO confirm intent.
        SAMRecord alignedRead = aligner.align(read,header);
        out.addAlignment(alignedRead);
        return 1;
    }

    /**
     * Initial value for reduce. In this case, alignments will be counted.
     * @return 0, indicating no alignments yet found.
     */
    @Override
    public Integer reduceInit() { return 0; }

    /**
     * Calculates the number of alignments found.
     * @param value Number of alignments found by this map.
     * @param sum Number of alignments found before this map.
     * @return Number of alignments found up to and including this map.
     */
    @Override
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /**
     * Cleanup.
     * @param result Number of reads processed.
     */
    @Override
    public void onTraversalDone(Integer result) {
        // Release the in-memory BWT index held by the native aligner.
        aligner.close();
        super.onTraversalDone(result);
    }
}

View File

@ -1,132 +0,0 @@
/*
* Copyright (c) 2010 The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.alignment;
import org.broadinstitute.sting.alignment.bwa.BWAConfiguration;
import org.broadinstitute.sting.alignment.bwa.BWTFiles;
import org.broadinstitute.sting.alignment.bwa.c.BWACAligner;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.CommandLineGATK;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.ReadWalker;
import org.broadinstitute.sting.utils.help.DocumentedGATKFeature;
import org.broadinstitute.sting.utils.sam.GATKSAMRecord;
import java.io.PrintStream;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
/**
 * Counts the number of best alignments as presented by BWA and outputs a histogram of number of placements vs. the
 * frequency of that number of placements.
 *
 * @author mhanna
 * @version 0.1
 */
@DocumentedGATKFeature( groupName = "BAM Processing and Analysis Tools", extraDocs = {CommandLineGATK.class} )
public class CountBestAlignments extends ReadWalker<Integer,Integer> {
    /**
     * The supporting BWT index generated using BWT.
     */
    @Argument(fullName="BWTPrefix",shortName="BWT",doc="Index files generated by bwa index -d bwtsw",required=false)
    private String prefix = null;

    // Destination stream for the histogram; injected by the command-line system.
    @Output
    private PrintStream out = null;

    /**
     * The actual aligner.
     */
    private Aligner aligner = null;

    // Histogram: number of placements -> how many reads had that many placements.
    // TreeMap keeps the output sorted by placement count.
    private SortedMap<Integer,Integer> alignmentFrequencies = new TreeMap<Integer,Integer>();

    /**
     * Create an aligner object. The aligner object will load and hold the BWT until close() is called.
     */
    @Override
    public void initialize() {
        // Default the index prefix to the -R reference's path when not given explicitly.
        if(prefix == null)
            prefix = getToolkit().getArguments().referenceFile.getAbsolutePath();

        BWTFiles bwtFiles = new BWTFiles(prefix);
        BWAConfiguration configuration = new BWAConfiguration();
        aligner = new BWACAligner(bwtFiles,configuration);
    }

    /**
     * Aligns a read to the given reference.
     *
     * @param ref Reference over the read. Read will most likely be unmapped, so ref will be null.
     * @param read Read to align.
     * @return Number of alignments found for this read.
     */
    @Override
    public Integer map(ReferenceContext ref, GATKSAMRecord read, ReadMetaDataTracker metaDataTracker) {
        // NOTE(review): only the first batch of alignments from the iterator is
        // counted — presumably the "best" tier from BWA; TODO confirm.
        Iterator<Alignment[]> alignmentIterator = aligner.getAllAlignments(read.getReadBases()).iterator();
        if(alignmentIterator.hasNext()) {
            int numAlignments = alignmentIterator.next().length;
            // Increment the histogram bucket for this placement count.
            if(alignmentFrequencies.containsKey(numAlignments))
                alignmentFrequencies.put(numAlignments,alignmentFrequencies.get(numAlignments)+1);
            else
                alignmentFrequencies.put(numAlignments,1);
        }
        return 1;
    }

    /**
     * Initial value for reduce. In this case, validated reads will be counted.
     * @return 0, indicating no reads yet validated.
     */
    @Override
    public Integer reduceInit() { return 0; }

    /**
     * Calculates the number of reads processed.
     * @param value Number of reads processed by this map.
     * @param sum Number of reads processed before this map.
     * @return Number of reads processed up to and including this map.
     */
    @Override
    public Integer reduce(Integer value, Integer sum) {
        return value + sum;
    }

    /**
     * Cleanup.
     * @param result Number of reads processed.
     */
    @Override
    public void onTraversalDone(Integer result) {
        // Release the BWT index, then emit the histogram as tab-separated
        // "placements<TAB>frequency" rows.
        aligner.close();
        for(Map.Entry<Integer,Integer> alignmentFrequency: alignmentFrequencies.entrySet())
            out.printf("%d\t%d%n", alignmentFrequency.getKey(), alignmentFrequency.getValue());
        super.onTraversalDone(result);
    }
}

View File

@ -62,7 +62,7 @@ public @interface Argument {
* --help argument is specified. * --help argument is specified.
* @return Doc string associated with this command-line argument. * @return Doc string associated with this command-line argument.
*/ */
String doc(); String doc() default "Undocumented option";
/** /**
* Is this argument required. If true, the command-line argument system will * Is this argument required. If true, the command-line argument system will

View File

@ -46,7 +46,7 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
/** /**
* Maps indices of command line arguments to values paired with that argument. * Maps indices of command line arguments to values paired with that argument.
*/ */
public final SortedMap<ArgumentMatchSite,List<String>> sites = new TreeMap<ArgumentMatchSite,List<String>>(); public final SortedMap<ArgumentMatchSite,List<ArgumentMatchValue>> sites = new TreeMap<ArgumentMatchSite,List<ArgumentMatchValue>>();
/** /**
* An ordered, freeform collection of tags. * An ordered, freeform collection of tags.
@ -90,11 +90,11 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
* @param value Value for the argument at this position. * @param value Value for the argument at this position.
* @param tags ordered freeform text tags associated with this argument. * @param tags ordered freeform text tags associated with this argument.
*/ */
private ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final String value, final Tags tags) { private ArgumentMatch(final String label, final ArgumentDefinition definition, final ArgumentMatchSite site, final ArgumentMatchValue value, final Tags tags) {
this.label = label; this.label = label;
this.definition = definition; this.definition = definition;
ArrayList<String> values = new ArrayList<String>(); ArrayList<ArgumentMatchValue> values = new ArrayList<ArgumentMatchValue>();
if( value != null ) if( value != null )
values.add(value); values.add(value);
sites.put(site,values ); sites.put(site,values );
@ -131,11 +131,11 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
ArgumentMatch transform(Multiplexer multiplexer, Object key) { ArgumentMatch transform(Multiplexer multiplexer, Object key) {
SortedMap<ArgumentMatchSite,List<String>> newIndices = new TreeMap<ArgumentMatchSite,List<String>>(); SortedMap<ArgumentMatchSite,List<ArgumentMatchValue>> newIndices = new TreeMap<ArgumentMatchSite,List<ArgumentMatchValue>>();
for(Map.Entry<ArgumentMatchSite,List<String>> site: sites.entrySet()) { for(Map.Entry<ArgumentMatchSite,List<ArgumentMatchValue>> site: sites.entrySet()) {
List<String> newEntries = new ArrayList<String>(); List<ArgumentMatchValue> newEntries = new ArrayList<ArgumentMatchValue>();
for(String entry: site.getValue()) for(ArgumentMatchValue entry: site.getValue())
newEntries.add(multiplexer.transformArgument(key,entry)); newEntries.add(new ArgumentMatchStringValue(multiplexer.transformArgument(key,entry.asString())));
newIndices.put(site.getKey(),newEntries); newIndices.put(site.getKey(),newEntries);
} }
ArgumentMatch newArgumentMatch = new ArgumentMatch(label,definition); ArgumentMatch newArgumentMatch = new ArgumentMatch(label,definition);
@ -165,7 +165,7 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
/** /**
* Iterate over each available token. * Iterate over each available token.
*/ */
private Iterator<String> tokenIterator = null; private Iterator<ArgumentMatchValue> tokenIterator = null;
/** /**
* The next site to return. Null if none remain. * The next site to return. Null if none remain.
@ -175,7 +175,7 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
/** /**
* The next token to return. Null if none remain. * The next token to return. Null if none remain.
*/ */
String nextToken = null; ArgumentMatchValue nextToken = null;
{ {
siteIterator = sites.keySet().iterator(); siteIterator = sites.keySet().iterator();
@ -254,9 +254,9 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
* @param site site of the command-line argument to which this value is mated. * @param site site of the command-line argument to which this value is mated.
* @param value Text representation of value to add. * @param value Text representation of value to add.
*/ */
public void addValue( ArgumentMatchSite site, String value ) { public void addValue( ArgumentMatchSite site, ArgumentMatchValue value ) {
if( !sites.containsKey(site) || sites.get(site) == null ) if( !sites.containsKey(site) || sites.get(site) == null )
sites.put(site, new ArrayList<String>() ); sites.put(site, new ArrayList<ArgumentMatchValue>() );
sites.get(site).add(value); sites.get(site).add(value);
} }
@ -275,8 +275,8 @@ public class ArgumentMatch implements Iterable<ArgumentMatch> {
* Return the values associated with this argument match. * Return the values associated with this argument match.
* @return A collection of the string representation of these value. * @return A collection of the string representation of these value.
*/ */
public List<String> values() { public List<ArgumentMatchValue> values() {
List<String> values = new ArrayList<String>(); List<ArgumentMatchValue> values = new ArrayList<ArgumentMatchValue>();
for( ArgumentMatchSite site: sites.keySet() ) { for( ArgumentMatchSite site: sites.keySet() ) {
if( sites.get(site) != null ) if( sites.get(site) != null )
values.addAll(sites.get(site)); values.addAll(sites.get(site));

View File

@ -0,0 +1,27 @@
package org.broadinstitute.sting.commandline;
import java.io.File;
/**
 * An argument match value backed by a java.io.File reference.
 *
 * Storing the File itself (rather than its path) matters when the concrete
 * type is a subclass of java.io.File, for example a Queue RemoteFile.
 */
public class ArgumentMatchFileValue extends ArgumentMatchValue {
    private final File file;

    public ArgumentMatchFileValue(File file) {
        this.file = file;
    }

    @Override
    public String asString() {
        // Render the wrapped file as its absolute path; a missing file yields null.
        if (file == null)
            return null;
        return file.getAbsolutePath();
    }

    @Override
    public File asFile() {
        // Hand back the original File so subclass identity is preserved.
        return file;
    }
}

View File

@ -24,38 +24,36 @@
package org.broadinstitute.sting.commandline; package org.broadinstitute.sting.commandline;
import java.io.File;
/** /**
* Where an argument match originated, via the commandline or a file. * Where an argument match originated, via the commandline or a custom provider.
*/ */
public class ArgumentMatchSource implements Comparable<ArgumentMatchSource> { public class ArgumentMatchSource implements Comparable<ArgumentMatchSource> {
public static final ArgumentMatchSource COMMAND_LINE = new ArgumentMatchSource(ArgumentMatchSourceType.CommandLine, null); public static final ArgumentMatchSource COMMAND_LINE = new ArgumentMatchSource(ArgumentMatchSourceType.CommandLine, null);
private final ArgumentMatchSourceType type; private final ArgumentMatchSourceType type;
private final File file; private final String description;
/** /**
* Creates an argument match source from the specified file. * Creates an argument match source from the specified file.
* @param file File specifying the arguments. Must not be null. * @param description Where the arguments originated.
*/ */
public ArgumentMatchSource(File file) { public ArgumentMatchSource(String description) {
this(ArgumentMatchSourceType.File, file); this(ArgumentMatchSourceType.Provider, description);
} }
private ArgumentMatchSource(ArgumentMatchSourceType type, File file) { private ArgumentMatchSource(ArgumentMatchSourceType type, String description) {
if (type == ArgumentMatchSourceType.File && file == null) if (type == ArgumentMatchSourceType.Provider && description == null)
throw new IllegalArgumentException("An argument match source of type File cannot have a null file."); throw new IllegalArgumentException("An argument match source provider cannot have a null description.");
this.type = type; this.type = type;
this.file = file; this.description = description;
} }
public ArgumentMatchSourceType getType() { public ArgumentMatchSourceType getType() {
return type; return type;
} }
public File getFile() { public String getDescription() {
return file; return description;
} }
@Override @Override
@ -65,13 +63,13 @@ public class ArgumentMatchSource implements Comparable<ArgumentMatchSource> {
ArgumentMatchSource that = (ArgumentMatchSource) o; ArgumentMatchSource that = (ArgumentMatchSource) o;
return (type == that.type) && (file == null ? that.file == null : file.equals(that.file)); return (type == that.type) && (description == null ? that.description == null : description.equals(that.description));
} }
@Override @Override
public int hashCode() { public int hashCode() {
int result = type != null ? type.hashCode() : 0; int result = type != null ? type.hashCode() : 0;
result = 31 * result + (file != null ? file.hashCode() : 0); result = 31 * result + (description != null ? description.hashCode() : 0);
return result; return result;
} }
@ -84,15 +82,15 @@ public class ArgumentMatchSource implements Comparable<ArgumentMatchSource> {
if (comp != 0) if (comp != 0)
return comp; return comp;
File f1 = this.file; String d1 = this.description;
File f2 = that.file; String d2 = that.description;
if ((f1 == null) ^ (f2 == null)) { if ((d1 == null) ^ (d2 == null)) {
// If one of the files is null and the other is not // If one of the descriptions is null and the other is not
// put the null file first // put the null description first
return f1 == null ? -1 : 1; return d1 == null ? -1 : 1;
} }
return f1 == null ? 0 : f1.compareTo(f2); return d1 == null ? 0 : d1.compareTo(d2);
} }
} }

View File

@ -25,8 +25,8 @@
package org.broadinstitute.sting.commandline; package org.broadinstitute.sting.commandline;
/** /**
* Type of where an argument match originated, via the commandline or a file. * Type of where an argument match originated, via the commandline or a some other provider.
*/ */
public enum ArgumentMatchSourceType { public enum ArgumentMatchSourceType {
CommandLine, File CommandLine, Provider
} }

View File

@ -0,0 +1,24 @@
package org.broadinstitute.sting.commandline;
import java.io.File;
/**
 * An argument match value backed by raw text, typically taken straight from
 * the command line or an arguments list file.
 */
public class ArgumentMatchStringValue extends ArgumentMatchValue {
    private final String value;

    public ArgumentMatchStringValue(String value) {
        this.value = value;
    }

    @Override
    public String asString() {
        // The stored text is already the string form.
        return value;
    }

    @Override
    public File asFile() {
        // Interpret the text as a path; a missing value yields null.
        if (value == null)
            return null;
        return new File(value);
    }
}

View File

@ -0,0 +1,18 @@
package org.broadinstitute.sting.commandline;
import java.io.File;
/**
 * Returns argument values as either strings or values.
 *
 * Subclasses wrap a single matched argument value and expose it in both
 * string and File form, so consumers can pick the representation they need.
 */
public abstract class ArgumentMatchValue {
    /**
     * @return the value of this argument as a String object.
     */
    public abstract String asString();

    /**
     * @return the value of this argument as a File object.
     */
    public abstract File asFile();
}

View File

@ -215,8 +215,8 @@ public abstract class ArgumentTypeDescriptor {
* @param matches The matches for the given argument. * @param matches The matches for the given argument.
* @return The value of the argument if available, or null if not present. * @return The value of the argument if available, or null if not present.
*/ */
protected String getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) { protected ArgumentMatchValue getArgumentValue( ArgumentDefinition definition, ArgumentMatches matches ) {
Collection<String> argumentValues = getArgumentValues( definition, matches ); Collection<ArgumentMatchValue> argumentValues = getArgumentValues( definition, matches );
if( argumentValues.size() > 1 ) if( argumentValues.size() > 1 )
throw new UserException.CommandLineException("Multiple values associated with given definition, but this argument expects only one: " + definition.fullName); throw new UserException.CommandLineException("Multiple values associated with given definition, but this argument expects only one: " + definition.fullName);
return argumentValues.size() > 0 ? argumentValues.iterator().next() : null; return argumentValues.size() > 0 ? argumentValues.iterator().next() : null;
@ -244,8 +244,8 @@ public abstract class ArgumentTypeDescriptor {
* @param matches The matches for the given argument. * @param matches The matches for the given argument.
* @return The value of the argument if available, or an empty collection if not present. * @return The value of the argument if available, or an empty collection if not present.
*/ */
protected Collection<String> getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) { protected Collection<ArgumentMatchValue> getArgumentValues( ArgumentDefinition definition, ArgumentMatches matches ) {
Collection<String> values = new ArrayList<String>(); Collection<ArgumentMatchValue> values = new ArrayList<ArgumentMatchValue>();
for( ArgumentMatch match: matches ) { for( ArgumentMatch match: matches ) {
if( match.definition.equals(definition) ) if( match.definition.equals(definition) )
values.addAll(match.values()); values.addAll(match.values());
@ -310,7 +310,7 @@ public abstract class ArgumentTypeDescriptor {
*/ */
protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) { protected Object parseBinding(ArgumentSource source, Type type, ArgumentMatches matches, Tags tags) {
ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
String value = getArgumentValue(defaultDefinition, matches); ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches);
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
Class<? extends Feature> parameterType = JVMUtils.getParameterizedTypeClass(type); Class<? extends Feature> parameterType = JVMUtils.getParameterizedTypeClass(type);
String name = defaultDefinition.fullName; String name = defaultDefinition.fullName;
@ -328,7 +328,7 @@ public abstract class ArgumentTypeDescriptor {
* @param fieldName The name of the field that was parsed. Used for error reporting. * @param fieldName The name of the field that was parsed. Used for error reporting.
* @return The newly created binding object of type bindingClass. * @return The newly created binding object of type bindingClass.
*/ */
public static Object parseBinding(String value, Class<? extends Feature> parameterType, Type bindingClass, public static Object parseBinding(ArgumentMatchValue value, Class<? extends Feature> parameterType, Type bindingClass,
String bindingName, Tags tags, String fieldName) { String bindingName, Tags tags, String fieldName) {
try { try {
String tribbleType = null; String tribbleType = null;
@ -337,7 +337,7 @@ public abstract class ArgumentTypeDescriptor {
throw new UserException.CommandLineException( throw new UserException.CommandLineException(
String.format("Unexpected number of positional tags for argument %s : %s. " + String.format("Unexpected number of positional tags for argument %s : %s. " +
"Rod bindings only support -X:type and -X:name,type argument styles", "Rod bindings only support -X:type and -X:name,type argument styles",
value, fieldName)); value.asString(), fieldName));
} else if ( tags.getPositionalTags().size() == 2 ) { } else if ( tags.getPositionalTags().size() == 2 ) {
// -X:name,type style // -X:name,type style
bindingName = tags.getPositionalTags().get(0); bindingName = tags.getPositionalTags().get(0);
@ -366,7 +366,7 @@ public abstract class ArgumentTypeDescriptor {
if ( tribbleType == null ) { if ( tribbleType == null ) {
// try to determine the file type dynamically // try to determine the file type dynamically
File file = new File(value); File file = value.asFile();
if ( file.canRead() && file.isFile() ) { if ( file.canRead() && file.isFile() ) {
FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file); FeatureManager.FeatureDescriptor featureDescriptor = manager.getByFiletype(file);
if ( featureDescriptor != null ) { if ( featureDescriptor != null ) {
@ -379,7 +379,7 @@ public abstract class ArgumentTypeDescriptor {
// IntervalBinding can be created from a normal String // IntervalBinding can be created from a normal String
Class rawType = (makeRawTypeIfNecessary(bindingClass)); Class rawType = (makeRawTypeIfNecessary(bindingClass));
try { try {
return rawType.getConstructor(String.class).newInstance(value); return rawType.getConstructor(String.class).newInstance(value.asString());
} catch (NoSuchMethodException e) { } catch (NoSuchMethodException e) {
/* ignore */ /* ignore */
} }
@ -399,7 +399,7 @@ public abstract class ArgumentTypeDescriptor {
} }
Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class); Constructor ctor = (makeRawTypeIfNecessary(bindingClass)).getConstructor(Class.class, String.class, String.class, String.class, Tags.class);
return ctor.newInstance(parameterType, bindingName, value, tribbleType, tags); return ctor.newInstance(parameterType, bindingName, value.asString(), tribbleType, tags);
} catch (Exception e) { } catch (Exception e) {
if ( e instanceof UserException ) if ( e instanceof UserException )
throw ((UserException)e); throw ((UserException)e);
@ -517,7 +517,7 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
return true; return true;
ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source); ArgumentDefinition defaultDefinition = createDefaultArgumentDefinition(source);
String value = getArgumentValue( defaultDefinition, matches ); ArgumentMatchValue value = getArgumentValue(defaultDefinition, matches);
Object result; Object result;
Tags tags = getArgumentTags(matches); Tags tags = getArgumentTags(matches);
@ -527,12 +527,12 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
Method valueOf = primitiveToWrapperMap.get(type).getMethod("valueOf",String.class); Method valueOf = primitiveToWrapperMap.get(type).getMethod("valueOf",String.class);
if(value == null) if(value == null)
throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); throw new MissingArgumentValueException(createDefaultArgumentDefinition(source));
result = valueOf.invoke(null,value.trim()); result = valueOf.invoke(null,value.asString().trim());
} else if (type.isEnum()) { } else if (type.isEnum()) {
Object[] vals = type.getEnumConstants(); Object[] vals = type.getEnumConstants();
Object defaultEnumeration = null; // as we look at options, record the default option if it exists Object defaultEnumeration = null; // as we look at options, record the default option if it exists
for (Object val : vals) { for (Object val : vals) {
if (String.valueOf(val).equalsIgnoreCase(value)) return val; if (String.valueOf(val).equalsIgnoreCase(value == null ? null : value.asString())) return val;
try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; } try { if (type.getField(val.toString()).isAnnotationPresent(EnumerationArgumentDefault.class)) defaultEnumeration = val; }
catch (NoSuchFieldException e) { throw new ReviewedStingException("parsing " + type.toString() + "doesn't contain the field " + val.toString()); } catch (NoSuchFieldException e) { throw new ReviewedStingException("parsing " + type.toString() + "doesn't contain the field " + val.toString()); }
} }
@ -544,10 +544,12 @@ class SimpleArgumentTypeDescriptor extends ArgumentTypeDescriptor {
else if (value == null) else if (value == null)
throw new MissingArgumentValueException(createDefaultArgumentDefinition(source)); throw new MissingArgumentValueException(createDefaultArgumentDefinition(source));
else else
throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value); throw new UnknownEnumeratedValueException(createDefaultArgumentDefinition(source),value.asString());
} else if (type.equals(File.class)) {
result = value == null ? null : value.asFile();
} else { } else {
Constructor ctor = type.getConstructor(String.class); Constructor ctor = type.getConstructor(String.class);
result = ctor.newInstance(value); result = ctor.newInstance(value == null ? null : value.asString());
} }
} catch (UserException e) { } catch (UserException e) {
throw e; throw e;

View File

@ -174,7 +174,7 @@ public abstract class CommandLineProgram {
ParsingEngine parser = clp.parser = new ParsingEngine(clp); ParsingEngine parser = clp.parser = new ParsingEngine(clp);
parser.addArgumentSource(clp.getClass()); parser.addArgumentSource(clp.getClass());
Map<ArgumentMatchSource, List<String>> parsedArgs; Map<ArgumentMatchSource, ParsedArgs> parsedArgs;
// process the args // process the args
if (clp.canAddArgumentsDynamically()) { if (clp.canAddArgumentsDynamically()) {

View File

@ -0,0 +1,13 @@
package org.broadinstitute.sting.commandline;
/**
* Represents a collection of parsed arguments for an argument source.
*
* Useful for printing out help documents.
*/
public abstract class ParsedArgs {
/**
* @return A compact description of the arguments from an provider/source.
*/
public abstract String getDescription();
}

View File

@ -0,0 +1,30 @@
package org.broadinstitute.sting.commandline;
import org.apache.commons.lang.StringUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* A list of string arguments, usually from the command line or an args list file.
*/
public class ParsedListArgs extends ParsedArgs {
private final List<String> args = new ArrayList<String>();
public ParsedListArgs() {
}
public ParsedListArgs(List<String> args) {
this.args.addAll(args);
}
public void add(String... args) {
this.args.addAll(Arrays.asList(args));
}
@Override
public String getDescription() {
return StringUtils.join(this.args, " ");
}
}

View File

@ -30,6 +30,7 @@ import org.apache.commons.io.FileUtils;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
import org.broadinstitute.sting.utils.classloader.JVMUtils; import org.broadinstitute.sting.utils.classloader.JVMUtils;
import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
@ -61,7 +62,7 @@ public class ParsingEngine {
* Indicates as best as possible where command-line text remains unmatched * Indicates as best as possible where command-line text remains unmatched
* to existing arguments. * to existing arguments.
*/ */
ArgumentMatches argumentMatches = null; private ArgumentMatches argumentMatches = null;
/** /**
* Techniques for parsing and for argument lookup. * Techniques for parsing and for argument lookup.
@ -88,7 +89,10 @@ public class ParsingEngine {
/** /**
* List of tags associated with the given instantiation of the command-line argument. * List of tags associated with the given instantiation of the command-line argument.
*/ */
private final Map<Object,Tags> tags = new IdentityHashMap<Object,Tags>(); private final Map<Object,Tags> tags = new IdentityHashMap<Object,Tags>();
private PluginManager<ParsingEngineArgumentProvider> argumentProviderPluginManager =
new PluginManager<ParsingEngineArgumentProvider>(ParsingEngineArgumentProvider.class);
/** /**
* our log, which we want to capture anything from org.broadinstitute.sting * our log, which we want to capture anything from org.broadinstitute.sting
@ -105,7 +109,10 @@ public class ParsingEngine {
argumentTypeDescriptors.addAll(clp.getArgumentTypeDescriptors()); argumentTypeDescriptors.addAll(clp.getArgumentTypeDescriptors());
argumentTypeDescriptors.addAll(STANDARD_ARGUMENT_TYPE_DESCRIPTORS); argumentTypeDescriptors.addAll(STANDARD_ARGUMENT_TYPE_DESCRIPTORS);
addArgumentSource(ParsingEngineArgumentFiles.class); List<Class<? extends ParsingEngineArgumentProvider>> providers = argumentProviderPluginManager.getPlugins();
for (Class<? extends ParsingEngineArgumentProvider> provider: providers) {
addArgumentSource(provider);
}
} }
/** /**
@ -117,6 +124,10 @@ public class ParsingEngine {
addArgumentSource(null, source); addArgumentSource(null, source);
} }
public ArgumentMatches getArgumentMatches() {
return argumentMatches;
}
/** /**
* Add an argument source. Argument sources are expected to have * Add an argument source. Argument sources are expected to have
* any number of fields with an @Argument annotation attached. * any number of fields with an @Argument annotation attached.
@ -156,29 +167,30 @@ public class ParsingEngine {
* @param tokens Tokens passed on the command line. * @param tokens Tokens passed on the command line.
* @return The parsed arguments by file. * @return The parsed arguments by file.
*/ */
public SortedMap<ArgumentMatchSource, List<String>> parse( String[] tokens ) { public SortedMap<ArgumentMatchSource, ParsedArgs> parse( String[] tokens ) {
argumentMatches = new ArgumentMatches(); argumentMatches = new ArgumentMatches();
SortedMap<ArgumentMatchSource, List<String>> parsedArgs = new TreeMap<ArgumentMatchSource, List<String>>(); SortedMap<ArgumentMatchSource, ParsedArgs> parsedArgs = new TreeMap<ArgumentMatchSource, ParsedArgs>();
List<String> cmdLineTokens = Arrays.asList(tokens); List<String> cmdLineTokens = Arrays.asList(tokens);
parse(ArgumentMatchSource.COMMAND_LINE, cmdLineTokens, argumentMatches, parsedArgs); parse(ArgumentMatchSource.COMMAND_LINE, cmdLineTokens, argumentMatches, parsedArgs);
ParsingEngineArgumentFiles argumentFiles = new ParsingEngineArgumentFiles(); List<ParsingEngineArgumentProvider> providers = argumentProviderPluginManager.createAllTypes();
// Load the arguments ONLY into the argument files. for (ParsingEngineArgumentProvider provider: providers) {
// Validation may optionally run on the rest of the arguments. // Load the arguments ONLY into the provider.
loadArgumentsIntoObject(argumentFiles); // Validation may optionally run on the rest of the arguments.
loadArgumentsIntoObject(provider);
}
for (File file: argumentFiles.files) { for (ParsingEngineArgumentProvider provider: providers) {
List<String> fileTokens = getArguments(file); provider.parse(this, parsedArgs);
parse(new ArgumentMatchSource(file), fileTokens, argumentMatches, parsedArgs);
} }
return parsedArgs; return parsedArgs;
} }
private void parse(ArgumentMatchSource matchSource, List<String> tokens, public void parse(ArgumentMatchSource matchSource, List<String> tokens,
ArgumentMatches argumentMatches, SortedMap<ArgumentMatchSource, List<String>> parsedArgs) { ArgumentMatches argumentMatches, SortedMap<ArgumentMatchSource, ParsedArgs> parsedArgs) {
ArgumentMatchSite lastArgumentMatchSite = new ArgumentMatchSite(matchSource, -1); ArgumentMatchSite lastArgumentMatchSite = new ArgumentMatchSite(matchSource, -1);
int i = 0; int i = 0;
@ -195,19 +207,44 @@ public class ParsingEngine {
} }
else { else {
if( argumentMatches.hasMatch(lastArgumentMatchSite) && if( argumentMatches.hasMatch(lastArgumentMatchSite) &&
!argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite)) !argumentMatches.getMatch(lastArgumentMatchSite).hasValueAtSite(lastArgumentMatchSite))
argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, token ); argumentMatches.getMatch(lastArgumentMatchSite).addValue( lastArgumentMatchSite, new ArgumentMatchStringValue(token) );
else else
argumentMatches.MissingArgument.addValue( site, token ); argumentMatches.MissingArgument.addValue( site, new ArgumentMatchStringValue(token) );
} }
i++; i++;
} }
parsedArgs.put(matchSource, tokens); parsedArgs.put(matchSource, new ParsedListArgs(tokens));
} }
private List<String> getArguments(File file) { public void parsePairs(ArgumentMatchSource matchSource, List<Pair<String, ArgumentMatchValue>> tokens,
ArgumentMatches argumentMatches, ParsedArgs matchSourceArgs,
SortedMap<ArgumentMatchSource, ParsedArgs> parsedArgs) {
int i = 0;
for (Pair<String, ArgumentMatchValue> pair: tokens) {
ArgumentMatchSite site = new ArgumentMatchSite(matchSource, i);
List<DefinitionMatcher> matchers = Arrays.asList(ArgumentDefinitions.FullNameDefinitionMatcher, ArgumentDefinitions.ShortNameDefinitionMatcher);
ArgumentDefinition definition = null;
for (DefinitionMatcher matcher: matchers) {
definition = argumentDefinitions.findArgumentDefinition( pair.getFirst(), matcher );
if (definition != null)
break;
}
if (definition == null)
continue;
ArgumentMatch argumentMatch = new ArgumentMatch(pair.getFirst(), definition, site, new Tags());
argumentMatches.mergeInto(argumentMatch);
argumentMatch.addValue(site, pair.getSecond());
i++;
}
parsedArgs.put(matchSource, matchSourceArgs);
}
protected List<String> getArguments(File file) {
try { try {
if (file.getAbsolutePath().endsWith(".list")) { if (file.getAbsolutePath().endsWith(".list")) {
return getListArguments(file); return getListArguments(file);
@ -283,9 +320,9 @@ public class ParsingEngine {
// Ensure that the field contents meet the validation criteria specified by the regular expression. // Ensure that the field contents meet the validation criteria specified by the regular expression.
for( ArgumentMatch verifiableMatch: verifiableMatches ) { for( ArgumentMatch verifiableMatch: verifiableMatches ) {
for( String value: verifiableMatch.values() ) { for( ArgumentMatchValue value: verifiableMatch.values() ) {
if( verifiableArgument.validation != null && !value.matches(verifiableArgument.validation) ) if( verifiableArgument.validation != null && !value.asString().matches(verifiableArgument.validation) )
invalidValues.add( new Pair<ArgumentDefinition,String>(verifiableArgument, value) ); invalidValues.add( new Pair<ArgumentDefinition,String>(verifiableArgument, value.asString()) );
} }
} }
} }
@ -629,21 +666,21 @@ class UnmatchedArgumentException extends ArgumentException {
private static String formatArguments( ArgumentMatch invalidValues ) { private static String formatArguments( ArgumentMatch invalidValues ) {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
for( ArgumentMatchSite site: invalidValues.sites.keySet() ) for( ArgumentMatchSite site: invalidValues.sites.keySet() )
for( String value: invalidValues.sites.get(site) ) { for( ArgumentMatchValue value: invalidValues.sites.get(site) ) {
switch (site.getSource().getType()) { switch (site.getSource().getType()) {
case CommandLine: case CommandLine:
sb.append( String.format("%nInvalid argument value '%s' at position %d.", sb.append( String.format("%nInvalid argument value '%s' at position %d.",
value, site.getIndex()) ); value.asString(), site.getIndex()) );
break; break;
case File: case Provider:
sb.append( String.format("%nInvalid argument value '%s' in file %s at position %d.", sb.append( String.format("%nInvalid argument value '%s' in %s at position %d.",
value, site.getSource().getFile().getAbsolutePath(), site.getIndex()) ); value.asString(), site.getSource().getDescription(), site.getIndex()) );
break; break;
default: default:
throw new RuntimeException( String.format("Unexpected argument match source type: %s", throw new RuntimeException( String.format("Unexpected argument match source type: %s",
site.getSource().getType())); site.getSource().getType()));
} }
if(value != null && Utils.dupString(' ',value.length()).equals(value)) if(value.asString() != null && Utils.dupString(' ',value.asString().length()).equals(value.asString()))
sb.append(" Please make sure any line continuation backslashes on your command line are not followed by whitespace."); sb.append(" Please make sure any line continuation backslashes on your command line are not followed by whitespace.");
} }
return sb.toString(); return sb.toString();
@ -696,12 +733,3 @@ class UnknownEnumeratedValueException extends ArgumentException {
return String.format("Invalid value %s specified for argument %s; valid options are (%s).", argumentPassed, definition.fullName, Utils.join(",",definition.validOptions)); return String.format("Invalid value %s specified for argument %s; valid options are (%s).", argumentPassed, definition.fullName, Utils.join(",",definition.validOptions));
} }
} }
/**
* Container class to store the list of argument files.
* The files will be parsed after the command line arguments.
*/
class ParsingEngineArgumentFiles {
@Argument(fullName = "arg_file", shortName = "args", doc = "Reads arguments from the specified file", required = false)
public List<File> files = new ArrayList<File>();
}

View File

@ -0,0 +1,30 @@
package org.broadinstitute.sting.commandline;
import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.SortedMap;
/**
* Container class to store the list of argument files.
* The files will be parsed after the command line arguments.
*/
public class ParsingEngineArgumentFiles extends ParsingEngineArgumentProvider {
@Argument(fullName = "arg_file", shortName = "args", doc = "Reads arguments from the specified file", required = false)
public List<File> files = new ArrayList<File>();
@Override
public void parse(ParsingEngine parsingEngine, SortedMap<ArgumentMatchSource, ParsedArgs> parsedArgs) {
ArgumentMatches argumentMatches = parsingEngine.getArgumentMatches();
for (File file: this.files) {
List<String> fileTokens = parsingEngine.getArguments(file);
parsingEngine.parse(new ArgumentMatchFileSource(file), fileTokens, argumentMatches, parsedArgs);
}
}
}
class ArgumentMatchFileSource extends ArgumentMatchSource {
ArgumentMatchFileSource(File file) {
super("file " + file.getAbsolutePath());
}
}

View File

@ -0,0 +1,12 @@
package org.broadinstitute.sting.commandline;
import java.util.List;
import java.util.SortedMap;
/**
* A class that can parse arguments for the engine
*/
public abstract class ParsingEngineArgumentProvider {
public abstract void parse(ParsingEngine parsingEngine, SortedMap<ArgumentMatchSource, ParsedArgs> parsedArgs);
}

View File

@ -117,6 +117,15 @@ public final class RodBinding<T extends Feature> {
this.bound = true; this.bound = true;
} }
/**
* For testing purposes only. Creates a RodBinding sufficient for looking up associations to rawName
* @param type
* @param rawName
*/
public RodBinding(Class<T> type, final String rawName) {
this(type, rawName, "missing", type.getSimpleName(), new Tags());
}
/** /**
* Make an unbound RodBinding<T>. Only available for creating the globally unique UNBOUND object * Make an unbound RodBinding<T>. Only available for creating the globally unique UNBOUND object
* @param type class this unbound RodBinding creates * @param type class this unbound RodBinding creates

View File

@ -112,31 +112,38 @@ public class CommandLineGATK extends CommandLineExecutable {
} }
} }
protected static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file"; public static final String PICARD_TEXT_SAM_FILE_ERROR_1 = "Cannot use index file with textual SAM file";
protected static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files"; public static final String PICARD_TEXT_SAM_FILE_ERROR_2 = "Cannot retrieve file pointers within SAM text files";
public static final String NO_SPACE_LEFT_ON_DEVICE_ERROR = "No space left on device";
public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded";
private static void checkForMaskedUserErrors(final Throwable t) { private static void checkForMaskedUserErrors(final Throwable t) {
final String message = t.getMessage(); final String message = t.getMessage();
if ( message == null ) if ( message == null )
return; return;
// we know what to do about the common "Too many open files" error // we know what to do about the common "Too many open files" error
if ( message.indexOf("Too many open files") != -1 ) if ( message.contains("Too many open files") )
exitSystemWithUserError(new UserException.TooManyOpenFiles()); exitSystemWithUserError(new UserException.TooManyOpenFiles());
// malformed BAM looks like a SAM file // malformed BAM looks like a SAM file
if ( message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_1) != -1 || if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) ||
message.indexOf(PICARD_TEXT_SAM_FILE_ERROR_2) != -1 ) message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) )
exitSystemWithSamError(t); exitSystemWithSamError(t);
// can't close tribble index when writing // can't close tribble index when writing
if ( message.indexOf("Unable to close index for") != -1 ) if ( message.contains("Unable to close index for") )
exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage())); exitSystemWithUserError(new UserException(t.getCause() == null ? message : t.getCause().getMessage()));
// disk is full // disk is full
if ( message.indexOf("No space left on device") != -1 ) if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) )
exitSystemWithUserError(new UserException(t.getMessage())); exitSystemWithUserError(new UserException.NoSpaceOnDevice());
if ( t.getCause() != null && t.getCause().getMessage().indexOf("No space left on device") != -1 ) if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) )
exitSystemWithUserError(new UserException(t.getCause().getMessage())); exitSystemWithUserError(new UserException.NoSpaceOnDevice());
// masked out of memory error
if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError )
exitSystemWithUserError(new UserException.NotEnoughMemory());
} }
/** /**

View File

@ -1,52 +0,0 @@
package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Describes the method for downsampling reads at a given locus.
*
* @author hanna
* @version 0.1
*/
public class DownsamplingMethod {
/**
* Type of downsampling to perform.
*/
public final DownsampleType type;
/**
* Actual downsampling target is specified as an integer number of reads.
*/
public final Integer toCoverage;
/**
* Actual downsampling target is specified as a fraction of total available reads.
*/
public final Double toFraction;
/**
* Expresses no downsampling applied at all.
*/
public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null);
public DownsamplingMethod(DownsampleType type, Integer toCoverage, Double toFraction) {
// Do some basic sanity checks on the downsampling parameters passed in.
// Can't leave toFraction and toCoverage null unless type is experimental naive duplicate eliminator.
if(type != DownsampleType.NONE && toFraction == null && toCoverage == null)
throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");
// Fraction and coverage cannot both be specified.
if(toFraction != null && toCoverage != null)
throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");
// Experimental by sample downsampling does not work with a fraction of reads.
if(type == DownsampleType.BY_SAMPLE && toFraction != null)
throw new UserException.CommandLineException("Cannot downsample to fraction with new EXPERIMENTAL_BY_SAMPLE method");
this.type = type;
this.toCoverage = toCoverage;
this.toFraction = toFraction;
}
}

View File

@ -24,25 +24,28 @@
package org.broadinstitute.sting.gatk; package org.broadinstitute.sting.gatk;
import com.google.java.contract.Ensures;
import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.reference.ReferenceSequenceFile; import net.sf.picard.reference.ReferenceSequenceFile;
import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import net.sf.samtools.SAMSequenceDictionary; import net.sf.samtools.SAMSequenceDictionary;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broad.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.datasources.reads.*;
import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource; import org.broadinstitute.sting.gatk.datasources.reference.ReferenceDataSource;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.executive.MicroScheduler; import org.broadinstitute.sting.gatk.executive.MicroScheduler;
import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter; import org.broadinstitute.sting.gatk.filters.ReadGroupBlackListFilter;
import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker;
import org.broadinstitute.sting.gatk.io.stubs.Stub; import org.broadinstitute.sting.gatk.io.stubs.Stub;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.iterators.ReadTransformersMode;
import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder; import org.broadinstitute.sting.gatk.refdata.tracks.RMDTrackBuilder;
import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
@ -50,21 +53,18 @@ import org.broadinstitute.sting.gatk.samples.SampleDB;
import org.broadinstitute.sting.gatk.samples.SampleDBBuilder; import org.broadinstitute.sting.gatk.samples.SampleDBBuilder;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; import org.broadinstitute.sting.utils.classloader.GATKLiteUtils;
import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.codecs.vcf.VCFHeader;
import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.collections.Pair;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.interval.IntervalUtils;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration; import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.variantcontext.GenotypeBuilder; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor;
import java.io.File; import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.*; import java.util.*;
import java.util.concurrent.TimeUnit;
/** /**
* A GenomeAnalysisEngine that runs a specified walker. * A GenomeAnalysisEngine that runs a specified walker.
@ -74,6 +74,7 @@ public class GenomeAnalysisEngine {
* our log, which we want to capture anything from this class * our log, which we want to capture anything from this class
*/ */
private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class); private static Logger logger = Logger.getLogger(GenomeAnalysisEngine.class);
public static final long NO_RUNTIME_LIMIT = -1;
/** /**
* The GATK command-line argument parsing code. * The GATK command-line argument parsing code.
@ -136,11 +137,18 @@ public class GenomeAnalysisEngine {
*/ */
private Collection<ReadFilter> filters; private Collection<ReadFilter> filters;
/**
* Collection of the read transformers applied to the reads
*/
private List<ReadTransformer> readTransformers;
/** /**
* Controls the allocation of threads between CPU vs IO. * Controls the allocation of threads between CPU vs IO.
*/ */
private ThreadAllocation threadAllocation; private ThreadAllocation threadAllocation;
private ReadMetrics cumulativeMetrics = null;
/** /**
* A currently hacky unique name for this GATK instance * A currently hacky unique name for this GATK instance
*/ */
@ -175,6 +183,13 @@ public class GenomeAnalysisEngine {
*/ */
private Collection<RMDTriplet> referenceMetaDataFiles; private Collection<RMDTriplet> referenceMetaDataFiles;
/**
* The threading efficiency monitor we use in the GATK to monitor our efficiency.
*
* May be null if one isn't active, or hasn't be initialized yet
*/
private ThreadEfficiencyMonitor threadEfficiencyMonitor = null;
/** /**
* Set the reference metadata files to use for this traversal. * Set the reference metadata files to use for this traversal.
* @param referenceMetaDataFiles Collection of files and descriptors over which to traverse. * @param referenceMetaDataFiles Collection of files and descriptors over which to traverse.
@ -252,6 +267,7 @@ public class GenomeAnalysisEngine {
// our microscheduler, which is in charge of running everything // our microscheduler, which is in charge of running everything
MicroScheduler microScheduler = createMicroscheduler(); MicroScheduler microScheduler = createMicroscheduler();
threadEfficiencyMonitor = microScheduler.getThreadEfficiencyMonitor();
// create temp directories as necessary // create temp directories as necessary
initializeTempDirectory(); initializeTempDirectory();
@ -280,6 +296,8 @@ public class GenomeAnalysisEngine {
static { static {
deprecatedGATKWalkers.put("CountCovariates", "2.0"); deprecatedGATKWalkers.put("CountCovariates", "2.0");
deprecatedGATKWalkers.put("TableRecalibration", "2.0"); deprecatedGATKWalkers.put("TableRecalibration", "2.0");
deprecatedGATKWalkers.put("AlignmentWalker", "2.2");
deprecatedGATKWalkers.put("CountBestAlignments", "2.2");
} }
/** /**
@ -349,32 +367,59 @@ public class GenomeAnalysisEngine {
return Collections.unmodifiableList(filters); return Collections.unmodifiableList(filters);
} }
/**
* Returns a list of active, initialized read transformers
*
* @param walker the walker we need to apply read transformers too
* @return a non-null list of read transformers
*/
public void initializeReadTransformers(final Walker walker) {
final List<ReadTransformer> activeTransformers = new ArrayList<ReadTransformer>();
final ReadTransformersMode overrideMode = WalkerManager.getWalkerAnnotation(walker, ReadTransformersMode.class);
final ReadTransformer.ApplicationTime overrideTime = overrideMode != null ? overrideMode.ApplicationTime() : null;
final PluginManager<ReadTransformer> pluginManager = new PluginManager<ReadTransformer>(ReadTransformer.class);
for ( final ReadTransformer transformer : pluginManager.createAllTypes() ) {
transformer.initialize(overrideTime, this, walker);
if ( transformer.enabled() )
activeTransformers.add(transformer);
}
setReadTransformers(activeTransformers);
}
public List<ReadTransformer> getReadTransformers() {
return readTransformers;
}
private void setReadTransformers(final List<ReadTransformer> readTransformers) {
if ( readTransformers == null )
throw new ReviewedStingException("read transformers cannot be null");
this.readTransformers = readTransformers;
}
/** /**
* Parse out the thread allocation from the given command-line argument. * Parse out the thread allocation from the given command-line argument.
*/ */
private void determineThreadAllocation() { private void determineThreadAllocation() {
Tags tags = parsingEngine.getTags(argCollection.numberOfThreads); if ( argCollection.numberOfDataThreads < 1 ) throw new UserException.BadArgumentValue("num_threads", "cannot be less than 1, but saw " + argCollection.numberOfDataThreads);
if ( argCollection.numberOfCPUThreadsPerDataThread < 1 ) throw new UserException.BadArgumentValue("num_cpu_threads", "cannot be less than 1, but saw " + argCollection.numberOfCPUThreadsPerDataThread);
if ( argCollection.numberOfIOThreads < 0 ) throw new UserException.BadArgumentValue("num_io_threads", "cannot be less than 0, but saw " + argCollection.numberOfIOThreads);
// TODO: Kill this complicated logic once Queue supports arbitrary tagged parameters. this.threadAllocation = new ThreadAllocation(argCollection.numberOfDataThreads,
Integer numCPUThreads = null; argCollection.numberOfCPUThreadsPerDataThread,
if(tags.containsKey("cpu") && argCollection.numberOfCPUThreads != null) argCollection.numberOfIOThreads,
throw new UserException("Number of CPU threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other."); argCollection.monitorThreadEfficiency);
else if(tags.containsKey("cpu"))
numCPUThreads = Integer.parseInt(tags.getValue("cpu"));
else if(argCollection.numberOfCPUThreads != null)
numCPUThreads = argCollection.numberOfCPUThreads;
Integer numIOThreads = null;
if(tags.containsKey("io") && argCollection.numberOfIOThreads != null)
throw new UserException("Number of IO threads specified both directly on the command-line and as a tag to the nt argument. Please specify only one or the other.");
else if(tags.containsKey("io"))
numIOThreads = Integer.parseInt(tags.getValue("io"));
else if(argCollection.numberOfIOThreads != null)
numIOThreads = argCollection.numberOfIOThreads;
this.threadAllocation = new ThreadAllocation(argCollection.numberOfThreads,numCPUThreads,numIOThreads);
} }
public int getTotalNumberOfThreads() {
return this.threadAllocation == null ? 1 : threadAllocation.getTotalNumThreads();
}
/** /**
* Allow subclasses and others within this package direct access to the walker manager. * Allow subclasses and others within this package direct access to the walker manager.
* @return The walker manager used by this package. * @return The walker manager used by this package.
@ -400,23 +445,19 @@ public class GenomeAnalysisEngine {
protected DownsamplingMethod getDownsamplingMethod() { protected DownsamplingMethod getDownsamplingMethod() {
GATKArgumentCollection argCollection = this.getArguments(); GATKArgumentCollection argCollection = this.getArguments();
DownsamplingMethod method; boolean useExperimentalDownsampling = argCollection.enableExperimentalDownsampling;
if(argCollection.getDownsamplingMethod() != null)
method = argCollection.getDownsamplingMethod(); DownsamplingMethod commandLineMethod = argCollection.getDownsamplingMethod();
else if(WalkerManager.getDownsamplingMethod(walker) != null) DownsamplingMethod walkerMethod = WalkerManager.getDownsamplingMethod(walker, useExperimentalDownsampling);
method = WalkerManager.getDownsamplingMethod(walker); DownsamplingMethod defaultMethod = DownsamplingMethod.getDefaultDownsamplingMethod(walker, useExperimentalDownsampling);
else
method = GATKArgumentCollection.getDefaultDownsamplingMethod(); return commandLineMethod != null ? commandLineMethod : (walkerMethod != null ? walkerMethod : defaultMethod);
return method;
} }
protected void setDownsamplingMethod(DownsamplingMethod method) { protected void setDownsamplingMethod(DownsamplingMethod method) {
argCollection.setDownsamplingMethod(method); argCollection.setDownsamplingMethod(method);
} }
public BAQ.QualityMode getWalkerBAQQualityMode() { return WalkerManager.getBAQQualityMode(walker); }
public BAQ.ApplicationTime getWalkerBAQApplicationTime() { return WalkerManager.getBAQApplicationTime(walker); }
protected boolean includeReadsWithDeletionAtLoci() { protected boolean includeReadsWithDeletionAtLoci() {
return walker.includeReadsWithDeletionAtLoci(); return walker.includeReadsWithDeletionAtLoci();
} }
@ -504,6 +545,7 @@ public class GenomeAnalysisEngine {
*/ */
protected Iterable<Shard> getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) { protected Iterable<Shard> getShardStrategy(SAMDataSource readsDataSource, ReferenceSequenceFile drivingDataSource, GenomeLocSortedSet intervals) {
ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null); ValidationExclusion exclusions = (readsDataSource != null ? readsDataSource.getReadsInfo().getValidationExclusionList() : null);
DownsamplingMethod downsamplingMethod = readsDataSource != null ? readsDataSource.getReadsInfo().getDownsamplingMethod() : null;
ReferenceDataSource referenceDataSource = this.getReferenceDataSource(); ReferenceDataSource referenceDataSource = this.getReferenceDataSource();
// If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition. // If reads are present, assume that accessing the reads is always the dominant factor and shard based on that supposition.
@ -538,10 +580,15 @@ public class GenomeAnalysisEngine {
throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals."); throw new UserException.CommandLineException("Pairs traversal cannot be used in conjunction with intervals.");
} }
// Use the experimental ReadShardBalancer if experimental downsampling is enabled
ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useExperimentalDownsampling ?
new ExperimentalReadShardBalancer() :
new ReadShardBalancer();
if(intervals == null) if(intervals == null)
return readsDataSource.createShardIteratorOverAllReads(new ReadShardBalancer()); return readsDataSource.createShardIteratorOverAllReads(readShardBalancer);
else else
return readsDataSource.createShardIteratorOverIntervals(intervals,new ReadShardBalancer()); return readsDataSource.createShardIteratorOverIntervals(intervals, readShardBalancer);
} }
else else
throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName()); throw new ReviewedStingException("Unable to determine walker type for walker " + walker.getClass().getName());
@ -639,14 +686,14 @@ public class GenomeAnalysisEngine {
// if include argument isn't given, create new set of all possible intervals // if include argument isn't given, create new set of all possible intervals
Pair<GenomeLocSortedSet, GenomeLocSortedSet> includeExcludePair = IntervalUtils.parseIntervalBindingsPair( final Pair<GenomeLocSortedSet, GenomeLocSortedSet> includeExcludePair = IntervalUtils.parseIntervalBindingsPair(
this.referenceDataSource, this.referenceDataSource,
argCollection.intervals, argCollection.intervals,
argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding, argCollection.intervalSetRule, argCollection.intervalMerging, argCollection.intervalPadding,
argCollection.excludeIntervals); argCollection.excludeIntervals);
GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst(); final GenomeLocSortedSet includeSortedSet = includeExcludePair.getFirst();
GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond(); final GenomeLocSortedSet excludeSortedSet = includeExcludePair.getSecond();
// if no exclude arguments, can return parseIntervalArguments directly // if no exclude arguments, can return parseIntervalArguments directly
if ( excludeSortedSet == null ) if ( excludeSortedSet == null )
@ -657,13 +704,15 @@ public class GenomeAnalysisEngine {
intervals = includeSortedSet.subtractRegions(excludeSortedSet); intervals = includeSortedSet.subtractRegions(excludeSortedSet);
// logging messages only printed when exclude (-XL) arguments are given // logging messages only printed when exclude (-XL) arguments are given
long toPruneSize = includeSortedSet.coveredSize(); final long toPruneSize = includeSortedSet.coveredSize();
long toExcludeSize = excludeSortedSet.coveredSize(); final long toExcludeSize = excludeSortedSet.coveredSize();
long intervalSize = intervals.coveredSize(); final long intervalSize = intervals.coveredSize();
logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize)); logger.info(String.format("Initial include intervals span %d loci; exclude intervals span %d loci", toPruneSize, toExcludeSize));
logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)", logger.info(String.format("Excluding %d loci from original intervals (%.2f%% reduction)",
toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize))); toPruneSize - intervalSize, (toPruneSize - intervalSize) / (0.01 * toPruneSize)));
} }
logger.info(String.format("Processing %d bp from intervals", intervals.coveredSize()));
} }
/** /**
@ -697,13 +746,12 @@ public class GenomeAnalysisEngine {
protected void initializeDataSources() { protected void initializeDataSources() {
logger.info("Strictness is " + argCollection.strictnessLevel); logger.info("Strictness is " + argCollection.strictnessLevel);
// TODO -- REMOVE ME
BAQ.DEFAULT_GOP = argCollection.BAQGOP;
validateSuppliedReference(); validateSuppliedReference();
setReferenceDataSource(argCollection.referenceFile); setReferenceDataSource(argCollection.referenceFile);
validateSuppliedReads(); validateSuppliedReads();
initializeReadTransformers(walker);
readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference()); readsDataSource = createReadsDataSource(argCollection,genomeLocParser,referenceDataSource.getReference());
for (ReadFilter filter : filters) for (ReadFilter filter : filters)
@ -784,14 +832,13 @@ public class GenomeAnalysisEngine {
* @return A data source for the given set of reads. * @return A data source for the given set of reads.
*/ */
private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) { private SAMDataSource createReadsDataSource(GATKArgumentCollection argCollection, GenomeLocParser genomeLocParser, IndexedFastaSequenceFile refReader) {
DownsamplingMethod method = getDownsamplingMethod(); DownsamplingMethod downsamplingMethod = getDownsamplingMethod();
// Synchronize the method back into the collection so that it shows up when // Synchronize the method back into the collection so that it shows up when
// interrogating for the downsample method during command line recreation. // interrogating for the downsample method during command line recreation.
setDownsamplingMethod(method); setDownsamplingMethod(downsamplingMethod);
if ( getWalkerBAQApplicationTime() == BAQ.ApplicationTime.FORBIDDEN && argCollection.BAQMode != BAQ.CalculationMode.OFF) logger.info(downsamplingMethod);
throw new UserException.BadArgumentValue("baq", "Walker cannot accept BAQ'd base qualities, and yet BAQ mode " + argCollection.BAQMode + " was requested.");
if (argCollection.removeProgramRecords && argCollection.keepProgramRecords) if (argCollection.removeProgramRecords && argCollection.keepProgramRecords)
throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options"); throw new UserException.BadArgumentValue("rpr / kpr", "Cannot enable both options");
@ -809,14 +856,11 @@ public class GenomeAnalysisEngine {
argCollection.useOriginalBaseQualities, argCollection.useOriginalBaseQualities,
argCollection.strictnessLevel, argCollection.strictnessLevel,
argCollection.readBufferSize, argCollection.readBufferSize,
method, downsamplingMethod,
new ValidationExclusion(Arrays.asList(argCollection.unsafe)), new ValidationExclusion(Arrays.asList(argCollection.unsafe)),
filters, filters,
readTransformers,
includeReadsWithDeletionAtLoci(), includeReadsWithDeletionAtLoci(),
getWalkerBAQApplicationTime() == BAQ.ApplicationTime.ON_INPUT ? argCollection.BAQMode : BAQ.CalculationMode.OFF,
getWalkerBAQQualityMode(),
refReader,
getBaseRecalibration(),
argCollection.defaultBaseQualities, argCollection.defaultBaseQualities,
removeProgramRecords); removeProgramRecords);
} }
@ -943,6 +987,22 @@ public class GenomeAnalysisEngine {
return this.intervals; return this.intervals;
} }
/**
* Get the list of regions of the genome being processed. If the user
* requested specific intervals, return those, otherwise return regions
* corresponding to the entire genome. Never returns null.
*
* @return a non-null set of intervals being processed
*/
@Ensures("result != null")
public GenomeLocSortedSet getRegionsOfGenomeBeingProcessed() {
if ( getIntervals() == null )
// if we don't have any intervals defined, create intervals from the reference itself
return GenomeLocSortedSet.createSetFromSequenceDictionary(getReferenceDataSource().getReference().getSequenceDictionary());
else
return getIntervals();
}
/** /**
* Gets the list of filters employed by this engine. * Gets the list of filters employed by this engine.
* @return Collection of filters (actual instances) used by this engine. * @return Collection of filters (actual instances) used by this engine.
@ -1000,7 +1060,19 @@ public class GenomeAnalysisEngine {
* owned by the caller; the caller can do with the object what they wish. * owned by the caller; the caller can do with the object what they wish.
*/ */
public ReadMetrics getCumulativeMetrics() { public ReadMetrics getCumulativeMetrics() {
return readsDataSource == null ? null : readsDataSource.getCumulativeReadMetrics(); // todo -- probably shouldn't be lazy
if ( cumulativeMetrics == null )
cumulativeMetrics = readsDataSource == null ? new ReadMetrics() : readsDataSource.getCumulativeReadMetrics();
return cumulativeMetrics;
}
/**
* Return the global ThreadEfficiencyMonitor, if there is one
*
* @return the monitor, or null if none is active
*/
public ThreadEfficiencyMonitor getThreadEfficiencyMonitor() {
return threadEfficiencyMonitor;
} }
// ------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------
@ -1020,6 +1092,33 @@ public class GenomeAnalysisEngine {
public String createApproximateCommandLineArgumentString(Object... argumentProviders) { public String createApproximateCommandLineArgumentString(Object... argumentProviders) {
return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders); return CommandLineUtils.createApproximateCommandLineArgumentString(parsingEngine,argumentProviders);
} }
/**
* Does the current runtime in unit exceed the runtime limit, if one has been provided?
*
* @param runtime the runtime of this GATK instance in minutes
* @param unit the time unit of runtime
* @return false if not limit was requested or if runtime <= the limit, true otherwise
*/
public boolean exceedsRuntimeLimit(final long runtime, final TimeUnit unit) {
if ( runtime < 0 ) throw new IllegalArgumentException("runtime must be >= 0 but got " + runtime);
if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT )
return false;
else {
final long actualRuntimeNano = TimeUnit.NANOSECONDS.convert(runtime, unit);
final long maxRuntimeNano = getRuntimeLimitInNanoseconds();
return actualRuntimeNano > maxRuntimeNano;
}
}
/**
* @return the runtime limit in nanoseconds, or -1 if no limit was specified
*/
public long getRuntimeLimitInNanoseconds() {
if ( getArguments().maxRuntime == NO_RUNTIME_LIMIT )
return -1;
else
return TimeUnit.NANOSECONDS.convert(getArguments().maxRuntime, getArguments().maxRuntimeUnits);
}
} }

View File

@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk;
import net.sf.picard.filter.SamRecordFilter; import net.sf.picard.filter.SamRecordFilter;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
@ -119,11 +118,18 @@ public class ReadMetrics implements Cloneable {
return nRecords; return nRecords;
} }
/**
* Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed.
*/
public void incrementNumIterations(final long by) {
nRecords += by;
}
/** /**
* Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed. * Increments the number of 'iterations' (one call of filter/map/reduce sequence) completed.
*/ */
public void incrementNumIterations() { public void incrementNumIterations() {
nRecords++; incrementNumIterations(1);
} }
public long getNumReadsSeen() { public long getNumReadsSeen() {

View File

@ -1,15 +1,15 @@
package org.broadinstitute.sting.gatk; package org.broadinstitute.sting.gatk;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.samtools.SAMFileHeader; import net.sf.samtools.SAMFileHeader;
import net.sf.samtools.SAMFileReader; import net.sf.samtools.SAMFileReader;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID; import org.broadinstitute.sting.gatk.datasources.reads.SAMReaderID;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import java.util.Collection; import java.util.Collection;
import java.util.List;
/** /**
* User: hanna * User: hanna
* Date: May 14, 2009 * Date: May 14, 2009
@ -30,16 +30,14 @@ import java.util.Collection;
public class ReadProperties { public class ReadProperties {
private final Collection<SAMReaderID> readers; private final Collection<SAMReaderID> readers;
private final SAMFileHeader header; private final SAMFileHeader header;
private final SAMFileHeader.SortOrder sortOrder;
private final SAMFileReader.ValidationStringency validationStringency; private final SAMFileReader.ValidationStringency validationStringency;
private final DownsamplingMethod downsamplingMethod; private final DownsamplingMethod downsamplingMethod;
private final ValidationExclusion exclusionList; private final ValidationExclusion exclusionList;
private final Collection<ReadFilter> supplementalFilters; private final Collection<ReadFilter> supplementalFilters;
private final List<ReadTransformer> readTransformers;
private final boolean includeReadsWithDeletionAtLoci; private final boolean includeReadsWithDeletionAtLoci;
private final boolean useOriginalBaseQualities; private final boolean useOriginalBaseQualities;
private final BAQ.CalculationMode cmode;
private final BAQ.QualityMode qmode;
private final IndexedFastaSequenceFile refReader; // read for BAQ, if desired
private final BaseRecalibration bqsrApplier;
private final byte defaultBaseQualities; private final byte defaultBaseQualities;
/** /**
@ -67,6 +65,14 @@ public class ReadProperties {
return header; return header;
} }
/**
* Gets the sort order of the reads
* @return the sort order of the reads
*/
public SAMFileHeader.SortOrder getSortOrder() {
return sortOrder;
}
/** /**
* How strict should validation be? * How strict should validation be?
* @return Stringency of validation. * @return Stringency of validation.
@ -95,6 +101,11 @@ public class ReadProperties {
return supplementalFilters; return supplementalFilters;
} }
public List<ReadTransformer> getReadTransformers() {
return readTransformers;
}
/** /**
* Return whether to use original base qualities. * Return whether to use original base qualities.
* @return Whether to use original base qualities. * @return Whether to use original base qualities.
@ -103,16 +114,6 @@ public class ReadProperties {
return useOriginalBaseQualities; return useOriginalBaseQualities;
} }
public BAQ.QualityMode getBAQQualityMode() { return qmode; }
public BAQ.CalculationMode getBAQCalculationMode() { return cmode; }
public IndexedFastaSequenceFile getRefReader() {
return refReader;
}
public BaseRecalibration getBQSRApplier() { return bqsrApplier; }
/** /**
* @return Default base quality value to fill reads missing base quality information. * @return Default base quality value to fill reads missing base quality information.
*/ */
@ -134,36 +135,29 @@ public class ReadProperties {
* @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method * @param includeReadsWithDeletionAtLoci if 'true', the base pileups sent to the walker's map() method
* will explicitly list reads with deletion over the current reference base; otherwise, only observed * will explicitly list reads with deletion over the current reference base; otherwise, only observed
* bases will be seen in the pileups, and the deletions will be skipped silently. * bases will be seen in the pileups, and the deletions will be skipped silently.
* @param cmode How should we apply the BAQ calculation to the reads?
* @param qmode How should we apply the BAQ calculation to the reads?
* @param refReader if applyBAQ is true, must be a valid pointer to a indexed fasta file reads so we can get the ref bases for BAQ calculation
* @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality. * @param defaultBaseQualities if the reads have incomplete quality scores, set them all to defaultBaseQuality.
*/ */
public ReadProperties( Collection<SAMReaderID> samFiles, public ReadProperties( Collection<SAMReaderID> samFiles,
SAMFileHeader header, SAMFileHeader header,
SAMFileHeader.SortOrder sortOrder,
boolean useOriginalBaseQualities, boolean useOriginalBaseQualities,
SAMFileReader.ValidationStringency strictness, SAMFileReader.ValidationStringency strictness,
DownsamplingMethod downsamplingMethod, DownsamplingMethod downsamplingMethod,
ValidationExclusion exclusionList, ValidationExclusion exclusionList,
Collection<ReadFilter> supplementalFilters, Collection<ReadFilter> supplementalFilters,
List<ReadTransformer> readTransformers,
boolean includeReadsWithDeletionAtLoci, boolean includeReadsWithDeletionAtLoci,
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
byte defaultBaseQualities) { byte defaultBaseQualities) {
this.readers = samFiles; this.readers = samFiles;
this.header = header; this.header = header;
this.sortOrder = sortOrder;
this.validationStringency = strictness; this.validationStringency = strictness;
this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod; this.downsamplingMethod = downsamplingMethod == null ? DownsamplingMethod.NONE : downsamplingMethod;
this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList; this.exclusionList = exclusionList == null ? new ValidationExclusion() : exclusionList;
this.supplementalFilters = supplementalFilters; this.supplementalFilters = supplementalFilters;
this.readTransformers = readTransformers;
this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci; this.includeReadsWithDeletionAtLoci = includeReadsWithDeletionAtLoci;
this.useOriginalBaseQualities = useOriginalBaseQualities; this.useOriginalBaseQualities = useOriginalBaseQualities;
this.cmode = cmode;
this.qmode = qmode;
this.refReader = refReader;
this.bqsrApplier = bqsrApplier;
this.defaultBaseQualities = defaultBaseQualities; this.defaultBaseQualities = defaultBaseQualities;
} }
} }

View File

@ -27,15 +27,18 @@ package org.broadinstitute.sting.gatk;
import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.filters.FilterManager; import org.broadinstitute.sting.gatk.filters.FilterManager;
import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.filters.ReadFilter;
import org.broadinstitute.sting.gatk.iterators.ReadTransformer;
import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.*;
import org.broadinstitute.sting.utils.baq.BAQ;
import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.classloader.PluginManager;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet; import org.broadinstitute.sting.utils.help.ResourceBundleExtractorDoclet;
import org.broadinstitute.sting.utils.text.TextFormattingUtils; import org.broadinstitute.sting.utils.text.TextFormattingUtils;
import java.lang.annotation.Annotation;
import java.util.*; import java.util.*;
/** /**
@ -303,9 +306,10 @@ public class WalkerManager extends PluginManager<Walker> {
* downsampling method is specified on the command-line, the command-line version will * downsampling method is specified on the command-line, the command-line version will
* be used instead. * be used instead.
* @param walkerClass The class of the walker to interrogate. * @param walkerClass The class of the walker to interrogate.
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
* @return The downsampling method, as specified by the walker. Null if none exists. * @return The downsampling method, as specified by the walker. Null if none exists.
*/ */
public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass) { public static DownsamplingMethod getDownsamplingMethod(Class<? extends Walker> walkerClass, boolean useExperimentalDownsampling) {
DownsamplingMethod downsamplingMethod = null; DownsamplingMethod downsamplingMethod = null;
if( walkerClass.isAnnotationPresent(Downsample.class) ) { if( walkerClass.isAnnotationPresent(Downsample.class) ) {
@ -313,17 +317,17 @@ public class WalkerManager extends PluginManager<Walker> {
DownsampleType type = downsampleParameters.by(); DownsampleType type = downsampleParameters.by();
Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null; Integer toCoverage = downsampleParameters.toCoverage() >= 0 ? downsampleParameters.toCoverage() : null;
Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null; Double toFraction = downsampleParameters.toFraction() >= 0.0d ? downsampleParameters.toFraction() : null;
downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction); downsamplingMethod = new DownsamplingMethod(type,toCoverage,toFraction,useExperimentalDownsampling);
} }
return downsamplingMethod; return downsamplingMethod;
} }
public static BAQ.QualityMode getBAQQualityMode(Walker walker) { public static <T extends Annotation> T getWalkerAnnotation(final Walker walker, final Class<T> clazz) {
return walker.getClass().getAnnotation(BAQMode.class).QualityMode(); return walker.getClass().getAnnotation(clazz);
} }
public static BAQ.ApplicationTime getBAQApplicationTime(Walker walker) { public static ReadTransformer.ApplicationTime getBAQApplicationTime(Walker walker) {
return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime(); return walker.getClass().getAnnotation(BAQMode.class).ApplicationTime();
} }
@ -332,10 +336,11 @@ public class WalkerManager extends PluginManager<Walker> {
* downsampling method is specified on the command-line, the command-line version will * downsampling method is specified on the command-line, the command-line version will
* be used instead. * be used instead.
* @param walker The walker to interrogate. * @param walker The walker to interrogate.
* @param useExperimentalDownsampling If true, use the experimental downsampling implementation
* @return The downsampling method, as specified by the walker. Null if none exists. * @return The downsampling method, as specified by the walker. Null if none exists.
*/ */
public static DownsamplingMethod getDownsamplingMethod(Walker walker) { public static DownsamplingMethod getDownsamplingMethod(Walker walker, boolean useExperimentalDownsampling) {
return getDownsamplingMethod(walker.getClass()); return getDownsamplingMethod(walker.getClass(), useExperimentalDownsampling);
} }
/** /**

View File

@ -31,8 +31,9 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Input; import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.IntervalBinding; import org.broadinstitute.sting.commandline.IntervalBinding;
import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport;
import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType;
import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.QualityUtils;
@ -41,7 +42,10 @@ import org.broadinstitute.sting.utils.interval.IntervalMergingRule;
import org.broadinstitute.sting.utils.interval.IntervalSetRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule;
import java.io.File; import java.io.File;
import java.util.*; import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.concurrent.TimeUnit;
/** /**
* @author aaron * @author aaron
@ -64,12 +68,35 @@ public class GATKArgumentCollection {
@Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false)
public Integer readBufferSize = null; public Integer readBufferSize = null;
// --------------------------------------------------------------------------------------------------------------
//
// GATKRunReport options
//
// --------------------------------------------------------------------------------------------------------------
@Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false) @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false)
public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD; public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD;
@Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false) @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see " + GATKRunReport.PHONE_HOME_DOCS_URL + " for details.", required = false)
public File gatkKeyFile = null; public File gatkKeyFile = null;
/**
* The GATKRunReport supports (as of GATK 2.2) tagging GATK runs with an arbitrary String tag that can be
* used to group together runs during later analysis. One use of this capability is to tag runs as GATK
* performance tests, so that the performance of the GATK over time can be assessed from the logs directly.
*
* Note that the tags do not conform to any ontology, so you are free to use any tags that you might find
* meaningful.
*/
@Argument(fullName = "tag", shortName = "tag", doc="Arbitrary tag string to identify this GATK run as part of a group of runs, for later analysis", required = false)
public String tag = "NA";
// --------------------------------------------------------------------------------------------------------------
//
// General features
//
// --------------------------------------------------------------------------------------------------------------
@Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false)
public List<String> readFilters = new ArrayList<String>(); public List<String> readFilters = new ArrayList<String>();
@ -115,15 +142,20 @@ public class GATKArgumentCollection {
@Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false) @Argument(fullName = "nonDeterministicRandomSeed", shortName = "ndrs", doc = "Makes the GATK behave non deterministically, that is, the random numbers generated will be different in every run", required = false)
public boolean nonDeterministicRandomSeed = false; public boolean nonDeterministicRandomSeed = false;
/** @Argument(fullName = "disableRandomization",doc="Completely eliminates randomization from nondeterministic methods. To be used mostly in the testing framework where dynamic parallelism can result in differing numbers of calls to the generator.")
* The override mechanism in the GATK, by default, populates the command-line arguments, then public boolean disableRandomization = false;
* the defaults from the walker annotations. Unfortunately, walker annotations should be trumped
* by a user explicitly specifying command-line arguments.
* TODO: Change the GATK so that walker defaults are loaded first, then command-line arguments.
*/
private static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;
private static int DEFAULT_DOWNSAMPLING_COVERAGE = 1000;
@Argument(fullName = "maxRuntime", shortName = "maxRuntime", doc="If provided, that GATK will stop execution cleanly as soon after maxRuntime has been exceeded, truncating the run but not exiting with a failure. By default the value is interpreted in minutes, but this can be changed by maxRuntimeUnits", required = false)
public long maxRuntime = GenomeAnalysisEngine.NO_RUNTIME_LIMIT;
@Argument(fullName = "maxRuntimeUnits", shortName = "maxRuntimeUnits", doc="The TimeUnit for maxRuntime", required = false)
public TimeUnit maxRuntimeUnits = TimeUnit.MINUTES;
// --------------------------------------------------------------------------------------------------------------
//
// Downsampling Arguments
//
// --------------------------------------------------------------------------------------------------------------
@Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false) @Argument(fullName = "downsampling_type", shortName="dt", doc="Type of reads downsampling to employ at a given locus. Reads will be selected randomly to be removed from the pile based on the method described here", required = false)
public DownsampleType downsamplingType = null; public DownsampleType downsamplingType = null;
@ -133,17 +165,20 @@ public class GATKArgumentCollection {
@Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false) @Argument(fullName = "downsample_to_coverage", shortName = "dcov", doc = "Coverage [integer] to downsample to at any given locus; note that downsampled reads are randomly selected from all possible reads at a locus", required = false)
public Integer downsampleCoverage = null; public Integer downsampleCoverage = null;
@Argument(fullName = "enable_experimental_downsampling", shortName = "enable_experimental_downsampling", doc = "Enable experimental engine-level downsampling", required = false)
@Hidden
public boolean enableExperimentalDownsampling = false;
/** /**
* Gets the downsampling method explicitly specified by the user. If the user didn't specify * Gets the downsampling method explicitly specified by the user. If the user didn't specify
* a default downsampling mechanism, return the default. * a default downsampling mechanism, return the default.
* @return The explicitly specified downsampling mechanism, or the default if none exists. * @return The explicitly specified downsampling mechanism, or the default if none exists.
*/ */
public DownsamplingMethod getDownsamplingMethod() { public DownsamplingMethod getDownsamplingMethod() {
if(downsamplingType == null && downsampleFraction == null && downsampleCoverage == null) if ( downsamplingType == null && downsampleFraction == null && downsampleCoverage == null )
return null; return null;
if(downsamplingType == null && downsampleCoverage != null)
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,downsampleCoverage,null); return new DownsamplingMethod(downsamplingType, downsampleCoverage, downsampleFraction, enableExperimentalDownsampling);
return new DownsamplingMethod(downsamplingType,downsampleCoverage,downsampleFraction);
} }
/** /**
@ -153,9 +188,11 @@ public class GATKArgumentCollection {
public void setDownsamplingMethod(DownsamplingMethod method) { public void setDownsamplingMethod(DownsamplingMethod method) {
if (method == null) if (method == null)
throw new IllegalArgumentException("method is null"); throw new IllegalArgumentException("method is null");
downsamplingType = method.type; downsamplingType = method.type;
downsampleCoverage = method.toCoverage; downsampleCoverage = method.toCoverage;
downsampleFraction = method.toFraction; downsampleFraction = method.toFraction;
enableExperimentalDownsampling = method.useExperimentalDownsampling;
} }
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
@ -174,17 +211,14 @@ public class GATKArgumentCollection {
// performance log arguments // performance log arguments
// //
// -------------------------------------------------------------------------------------------------------------- // --------------------------------------------------------------------------------------------------------------
@Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
public File performanceLog = null;
/** /**
* Gets the default downsampling method, returned if the user didn't specify any downsampling * The file name for the GATK performance log output, or null if you don't want to generate the
* method. * detailed performance logging table. This table is suitable for importing into R or any
* @return The default downsampling mechanism, or null if none exists. * other analysis software that can read tsv files
*/ */
public static DownsamplingMethod getDefaultDownsamplingMethod() { @Argument(fullName = "performanceLog", shortName="PF", doc="If provided, a GATK runtime performance log will be written to this file", required = false)
return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE,DEFAULT_DOWNSAMPLING_COVERAGE,null); public File performanceLog = null;
}
@Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false) @Argument(fullName="useOriginalQualities", shortName = "OQ", doc = "If set, use the original base quality scores from the OQ tag when present instead of the standard scores", required=false)
public Boolean useOriginalBaseQualities = false; public Boolean useOriginalBaseQualities = false;
@ -256,20 +290,40 @@ public class GATKArgumentCollection {
@Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false) @Argument(fullName = "unsafe", shortName = "U", doc = "If set, enables unsafe operations: nothing will be checked at runtime. For expert users only who know what they are doing. We do not support usage of this argument.", required = false)
public ValidationExclusion.TYPE unsafe; public ValidationExclusion.TYPE unsafe;
/** How many threads should be allocated to this analysis. */ // --------------------------------------------------------------------------------------------------------------
@Argument(fullName = "num_threads", shortName = "nt", doc = "How many threads should be allocated to running this analysis.", required = false) //
public Integer numberOfThreads = 1; // Multi-threading arguments
//
// --------------------------------------------------------------------------------------------------------------
/** /**
* The following two arguments (num_cpu_threads, num_io_threads are TEMPORARY since Queue cannot currently support arbitrary tagged data types. * How many data threads should be allocated to this analysis? Data threads contains N cpu threads per
* TODO: Kill this when I can do a tagged integer in Queue. * data thread, and act as completely data parallel processing, increasing the memory usage of GATK
* by M data threads. Data threads generally scale extremely effectively, up to 24 cores
*/ */
@Argument(fullName="num_cpu_threads", shortName = "nct", doc="How many of the given threads should be allocated to the CPU", required = false) @Argument(fullName = "num_threads", shortName = "nt", doc = "How many data threads should be allocated to running this analysis.", required = false)
@Hidden public Integer numberOfDataThreads = 1;
public Integer numberOfCPUThreads = null;
/**
* How many CPU threads should be allocated per data thread? Each CPU thread operates the map
* cycle independently, but may run into earlier scaling problems with IO than data threads. Has
* the benefit of not requiring X times as much memory per thread as data threads do, but rather
* only a constant overhead.
*/
@Argument(fullName="num_cpu_threads_per_data_thread", shortName = "nct", doc="How many CPU threads should be allocated per data thread to running this analysis?", required = false)
public int numberOfCPUThreadsPerDataThread = 1;
@Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false) @Argument(fullName="num_io_threads", shortName = "nit", doc="How many of the given threads should be allocated to IO", required = false)
@Hidden @Hidden
public Integer numberOfIOThreads = null; public int numberOfIOThreads = 0;
/**
* Enable GATK to monitor its own threading efficiency, at a itsy-bitsy tiny
* cost (< 0.1%) in runtime because of turning on the JavaBean. This is largely for
* debugging purposes.
*/
@Argument(fullName = "monitorThreadEfficiency", shortName = "mte", doc = "Enable GATK threading efficiency monitoring", required = false)
public Boolean monitorThreadEfficiency = false;
@Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false) @Argument(fullName = "num_bam_file_handles", shortName = "bfh", doc="The total number of BAM file handles to keep open simultaneously", required=false)
public Integer numberOfBAMFileHandles = null; public Integer numberOfBAMFileHandles = null;

View File

@ -1,13 +1,14 @@
package org.broadinstitute.sting.gatk.arguments; package org.broadinstitute.sting.gatk.arguments;
import org.broadinstitute.sting.commandline.Advanced; import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Input;
import org.broadinstitute.sting.commandline.RodBinding;
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel;
import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine;
import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.AFCalcFactory;
import org.broadinstitute.sting.utils.variantcontext.VariantContext; import org.broadinstitute.sting.utils.variantcontext.VariantContext;
import java.io.File;
import java.io.PrintStream;
/** /**
* Created with IntelliJ IDEA. * Created with IntelliJ IDEA.
* User: rpoplin * User: rpoplin
@ -55,8 +56,51 @@ public class StandardCallerArgumentCollection {
* then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it * then only this many alleles will be used. Note that genotyping sites with many alternate alleles is both CPU and memory intensive and it
* scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend * scales exponentially based on the number of alternate alleles. Unless there is a good reason to change the default value, we highly recommend
* that you not play around with this parameter. * that you not play around with this parameter.
*
* As of GATK 2.2 the genotyper can handle a very large number of events, so the default maximum has been increased to 6.
*/ */
@Advanced @Advanced
@Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false) @Argument(fullName = "max_alternate_alleles", shortName = "maxAltAlleles", doc = "Maximum number of alternate alleles to genotype", required = false)
public int MAX_ALTERNATE_ALLELES = 3; public int MAX_ALTERNATE_ALLELES = 6;
/**
* Controls the model used to calculate the probability that a site is variant plus the various sample genotypes in the data at a given locus.
*/
@Advanced
@Argument(fullName = "p_nonref_model", shortName = "pnrm", doc = "Non-reference probability calculation model to employ", required = false)
public AFCalcFactory.Calculation AFmodel = AFCalcFactory.Calculation.getDefaultModel();
/**
* If this fraction is greater is than zero, the caller will aggressively attempt to remove contamination through biased down-sampling of reads.
* Basically, it will ignore the contamination fraction of reads for each alternate allele. So if the pileup contains N total bases, then we
* will try to remove (N * contamination fraction) bases for each alternate allele.
*/
@Argument(fullName = "contamination_fraction_to_filter", shortName = "contamination", doc = "Fraction of contamination in sequencing data (for all samples) to aggressively remove", required = false)
public double CONTAMINATION_FRACTION = DEFAULT_CONTAMINATION_FRACTION;
public static final double DEFAULT_CONTAMINATION_FRACTION = 0.05;
@Hidden
@Argument(fullName = "logRemovedReadsFromContaminationFiltering", shortName="contaminationLog", required=false)
public PrintStream contaminationLog = null;
@Hidden
@Argument(shortName = "logExactCalls", doc="x", required=false)
public File exactCallsLog = null;
public StandardCallerArgumentCollection() { }
// Developers must remember to add any newly added arguments to the list here as well otherwise they won't get changed from their default value!
/**
 * Copy constructor: creates a new argument collection whose values mirror those of {@code SCAC}.
 *
 * Note that this is a shallow field-by-field copy — reference-typed members (e.g. {@code alleles},
 * {@code contaminationLog}, {@code exactCallsLog}) are shared with the source collection, not cloned.
 * Any argument added to this class but omitted below will silently keep its default value in the copy.
 *
 * @param SCAC the collection to copy values from; must not be null
 */
public StandardCallerArgumentCollection(final StandardCallerArgumentCollection SCAC) {
this.alleles = SCAC.alleles;
this.GenotypingMode = SCAC.GenotypingMode;
this.heterozygosity = SCAC.heterozygosity;
this.MAX_ALTERNATE_ALLELES = SCAC.MAX_ALTERNATE_ALLELES;
this.OutputMode = SCAC.OutputMode;
this.STANDARD_CONFIDENCE_FOR_CALLING = SCAC.STANDARD_CONFIDENCE_FOR_CALLING;
this.STANDARD_CONFIDENCE_FOR_EMITTING = SCAC.STANDARD_CONFIDENCE_FOR_EMITTING;
this.CONTAMINATION_FRACTION = SCAC.CONTAMINATION_FRACTION;
this.contaminationLog = SCAC.contaminationLog;
this.exactCallsLog = SCAC.exactCallsLog;
this.AFmodel = SCAC.AFmodel;
}
} }

View File

@ -177,7 +177,7 @@ public class ReferenceContext {
* @return The base at the given locus from the reference. * @return The base at the given locus from the reference.
*/ */
public byte getBase() { public byte getBase() {
return getBases()[(int)(locus.getStart() - window.getStart())]; return getBases()[(locus.getStart() - window.getStart())];
} }
/** /**

View File

@ -0,0 +1,143 @@
package org.broadinstitute.sting.gatk.datasources.providers;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.picard.util.PeekableIterator;
import org.broadinstitute.sting.gatk.refdata.RODRecordListImpl;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.util.Collection;
import java.util.LinkedList;
import java.util.ListIterator;
/**
* Key algorithmic helper for ReadBasedReferenceOrderedData
*
* Takes a single iterator of features, and provides a single capability that returns
* the list of RODs that overlap an interval. Allows sequential getOverlapping calls
* from intervals provided that these intervals always have increasing getStart() values.
*
*/
class IntervalOverlappingRODsFromStream {
/**
* Only held for QC purposes
*/
GenomeLoc lastQuery = null;
private final String name;
private final LinkedList<GATKFeature> currentFeatures = new LinkedList<GATKFeature>();
private final PeekableIterator<RODRecordList> futureFeatures;
/**
* Create a new IntervalOverlappingRODsFromStream that reads elements from futureFeatures and
* returns RODRecordLists having name
*
* @param name
* @param futureFeatures
*/
IntervalOverlappingRODsFromStream(final String name, final PeekableIterator<RODRecordList> futureFeatures) {
if ( futureFeatures == null ) throw new IllegalArgumentException("futureFeatures cannot be null");
this.name = name;
this.futureFeatures = futureFeatures;
}
/**
* Get the list of RODs overlapping loc from this stream of RODs.
*
* Sequential calls to this function must obey the rule that loc2.getStart >= loc1.getStart
*
* @param loc the interval to query
* @return a non-null RODRecordList containing the overlapping RODs, which may be empty
*/
@Ensures({"overlaps(loc, result)",
"! futureFeatures.hasNext() || futureFeatures.peek().getLocation().isPast(loc)",
"result != null"})
public RODRecordList getOverlapping(final GenomeLoc loc) {
if ( lastQuery != null && loc.getStart() < lastQuery.getStart() )
throw new IllegalArgumentException(String.format("BUG: query interval (%s) starts before the previous interval %s", loc, lastQuery));
trimCurrentFeaturesToLoc(loc);
readOverlappingFutureFeatures(loc);
return new RODRecordListImpl(name, subsetToOverlapping(loc, currentFeatures), loc);
}
/**
* For contract assurance. Checks that all bindings in loc overlap
*
* @param loc
* @param bindings
* @return
*/
@Requires({"loc != null", "bindings != null"})
private boolean overlaps(final GenomeLoc loc, final RODRecordList bindings) {
for ( final GATKFeature feature : bindings )
if ( ! feature.getLocation().overlapsP(loc) )
return false;
return true;
}
/**
* Subset the features in all to those that overlap with loc
*
* The current features list contains everything read that cannot be thrown away yet, but not
* everything in there necessarily overlaps with loc. Subset to just those that do overlap
*
* @param loc the location that features must overlap
* @param all the list of all features
* @return a subset of all that overlaps with loc
*/
@Requires({"loc != null", "all != null"})
@Ensures("result.size() <= all.size()")
private Collection<GATKFeature> subsetToOverlapping(final GenomeLoc loc, final Collection<GATKFeature> all) {
final LinkedList<GATKFeature> overlapping = new LinkedList<GATKFeature>();
for ( final GATKFeature feature : all )
if ( feature.getLocation().overlapsP(loc) )
overlapping.add(feature);
return overlapping;
}
/**
* Update function. Remove all elements of currentFeatures that end before loc
*
* @param loc the location to use
*/
@Requires("loc != null")
@Ensures("currentFeatures.size() <= old(currentFeatures.size())")
private void trimCurrentFeaturesToLoc(final GenomeLoc loc) {
final ListIterator<GATKFeature> it = currentFeatures.listIterator();
while ( it.hasNext() ) {
final GATKFeature feature = it.next();
if ( feature.getLocation().isBefore(loc) )
it.remove();
}
}
/**
* Update function: Read all elements from futureFeatures that overlap with loc
*
* Stops at the first element that starts before the end of loc, or the stream empties
*
* @param loc
*/
@Requires("loc != null")
@Ensures("currentFeatures.size() >= old(currentFeatures.size())")
private void readOverlappingFutureFeatures(final GenomeLoc loc) {
while ( futureFeatures.hasNext() ) {
final GenomeLoc nextLoc = futureFeatures.peek().getLocation();
if ( nextLoc.isBefore(loc) ) {
futureFeatures.next(); // next rod element is before loc, throw it away and keep looking
} else if ( nextLoc.isPast(loc) ) {
break; // next element is past loc, stop looking but don't pop it
} else if ( nextLoc.overlapsP(loc) ) {
// add overlapping elements to our current features, removing from stream
for ( final GATKFeature feature : futureFeatures.next() ) {
currentFeatures.add(feature);
}
}
}
}
}

View File

@ -1,6 +1,6 @@
package org.broadinstitute.sting.gatk.datasources.providers; package org.broadinstitute.sting.gatk.datasources.providers;
import org.broadinstitute.sting.gatk.DownsampleType; import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.iterators.LocusIterator; import org.broadinstitute.sting.gatk.iterators.LocusIterator;
@ -135,8 +135,13 @@ public abstract class LocusView extends LocusIterator implements View {
// Cache the current and apply filtering. // Cache the current and apply filtering.
AlignmentContext current = nextLocus; AlignmentContext current = nextLocus;
if( sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null )
// The old ALL_READS downsampling implementation -- only use if we're not using the new experimental downsampling:
if( ! sourceInfo.getDownsamplingMethod().useExperimentalDownsampling &&
sourceInfo.getDownsamplingMethod().type == DownsampleType.ALL_READS && sourceInfo.getDownsamplingMethod().toCoverage != null ) {
current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage ); current.downsampleToCoverage( sourceInfo.getDownsamplingMethod().toCoverage );
}
// Indicate that the next operation will need to advance. // Indicate that the next operation will need to advance.
nextLocus = null; nextLocus = null;

View File

@ -58,7 +58,7 @@ public class ManagingReferenceOrderedView implements ReferenceOrderedView {
// todo -- warning, I removed the reference to the name from states // todo -- warning, I removed the reference to the name from states
bindings.add( state.iterator.seekForward(loc) ); bindings.add( state.iterator.seekForward(loc) );
return new RefMetaDataTracker(bindings, referenceContext); return new RefMetaDataTracker(bindings);
} }
/** /**

View File

@ -23,40 +23,63 @@
package org.broadinstitute.sting.gatk.datasources.providers; package org.broadinstitute.sting.gatk.datasources.providers;
import com.google.java.contract.Ensures;
import com.google.java.contract.Requires;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.datasources.reads.ReadShard;
import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator; import org.broadinstitute.sting.gatk.refdata.utils.LocationAwareSeekableRODIterator;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.TreeMap;
/** a ROD view for reads. This provides the Read traversals a way of getting a ReadMetaDataTracker */ /** a ROD view for reads. This provides the Read traversals a way of getting a RefMetaDataTracker */
public class ReadBasedReferenceOrderedView implements View { public class ReadBasedReferenceOrderedView implements View {
private final WindowedData window; // a list of the RMDDataState (location->iterators)
private final List<RMDDataState> states = new ArrayList<RMDDataState>(1);
public ReadBasedReferenceOrderedView(ShardDataProvider provider) { private final static RefMetaDataTracker EMPTY_TRACKER = new RefMetaDataTracker();
window = new WindowedData(provider);
provider.register(this);
}
/** /**
* for testing only please * Used to get genome locs for reads
*
* @param data the window provider
*/ */
ReadBasedReferenceOrderedView(WindowedData data) { private final GenomeLocParser genomeLocParser;
window = data;
/**
* The total extent of all reads in this span. We create iterators from our RODs
* from the start of this span, to the end.
*/
private final GenomeLoc shardSpan;
public ReadBasedReferenceOrderedView(final ShardDataProvider provider) {
this.genomeLocParser = provider.getGenomeLocParser();
// conditional to optimize the case where we don't have any ROD data
this.shardSpan = provider.getReferenceOrderedData() != null ? ((ReadShard)provider.getShard()).getReadsSpan() : null;
provider.register(this);
if ( provider.getReferenceOrderedData() != null && ! shardSpan.isUnmapped() ) {
for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
states.add(new RMDDataState(dataSource, dataSource.seek(shardSpan)));
}
} }
public ReadMetaDataTracker getReferenceOrderedDataForRead(SAMRecord read) {
return window.getTracker(read); /**
* Testing constructor
*/
protected ReadBasedReferenceOrderedView(final GenomeLocParser genomeLocParser,
final GenomeLoc shardSpan,
final List<String> names,
final List<PeekableIterator<RODRecordList>> featureSources) {
this.genomeLocParser = genomeLocParser;
this.shardSpan = shardSpan;
for ( int i = 0; i < names.size(); i++ )
states.add(new RMDDataState(names.get(i), featureSources.get(i)));
} }
public Collection<Class<? extends View>> getConflictingViews() { public Collection<Class<? extends View>> getConflictingViews() {
@ -65,135 +88,72 @@ public class ReadBasedReferenceOrderedView implements View {
return classes; return classes;
} }
public void close() {
if (window != null) window.close();
}
}
/** stores a window of data, dropping RODs if we've passed the new reads start point. */
class WindowedData {
// the queue of possibly in-frame RODs; RODs are removed as soon as they are out of scope
private final TreeMap<Integer, RODMetaDataContainer> mapping = new TreeMap<Integer, RODMetaDataContainer>();
// our current location from the last read we processed
private GenomeLoc currentLoc;
// a list of the RMDDataState (location->iterators)
private List<RMDDataState> states;
// the provider; where we get all our information
private final ShardDataProvider provider;
/** /**
* our log, which we want to capture anything from this class * create a RefMetaDataTracker given the current read
*/
private static Logger logger = Logger.getLogger(WindowedData.class);
/**
* create a WindowedData given a shard provider
*
* @param provider the ShardDataProvider
*/
public WindowedData(ShardDataProvider provider) {
this.provider = provider;
}
/**
* load the states dynamically, since the only way to get a genome loc is from the read (the shard doesn't have one)
*
* @param provider the ShardDataProvider
* @param rec the current read
*/
private void getStates(ShardDataProvider provider, SAMRecord rec) {
int stop = Integer.MAX_VALUE;
// figure out the appropriate alignment stop
if (provider.hasReference()) {
stop = provider.getReference().getSequenceDictionary().getSequence(rec.getReferenceIndex()).getSequenceLength();
}
// calculate the range of positions we need to look at
GenomeLoc range = provider.getGenomeLocParser().createGenomeLoc(rec.getReferenceName(),
rec.getAlignmentStart(),
stop);
states = new ArrayList<RMDDataState>();
if (provider.getReferenceOrderedData() != null)
for (ReferenceOrderedDataSource dataSource : provider.getReferenceOrderedData())
states.add(new RMDDataState(dataSource, dataSource.seek(range)));
}
/**
* this function is for testing only
*
* @param states a list of RMDDataState to initialize with
*/
WindowedData(List<RMDDataState> states) {
this.states = states;
provider = null;
}
/**
* create a ReadMetaDataTracker given the current read
* *
* @param rec the read * @param rec the read
* *
* @return a ReadMetaDataTracker for the read, from which you can get ROD -> read alignments * @return a RefMetaDataTracker for the read, from which you can get ROD -> read alignments
*/ */
public ReadMetaDataTracker getTracker(SAMRecord rec) { @Requires("rec != null")
updatePosition(rec); @Ensures("result != null")
return new ReadMetaDataTracker(provider.getGenomeLocParser(), rec, mapping); public RefMetaDataTracker getReferenceOrderedDataForRead(final SAMRecord rec) {
if ( rec.getReadUnmappedFlag() )
// empty RODs for unmapped reads
return new RefMetaDataTracker();
else
return getReferenceOrderedDataForInterval(genomeLocParser.createGenomeLoc(rec));
} }
/** @Requires({"interval != null", "shardSpan == null || shardSpan.isUnmapped() || shardSpan.containsP(interval)"})
* update the position we're storing @Ensures("result != null")
* public RefMetaDataTracker getReferenceOrderedDataForInterval(final GenomeLoc interval) {
* @param rec the read to use for start and end if ( states.isEmpty() || shardSpan.isUnmapped() ) // optimization for no bindings (common for read walkers)
*/ return EMPTY_TRACKER;
private void updatePosition(SAMRecord rec) { else {
if (states == null) getStates(this.provider, rec); final List<RODRecordList> bindings = new ArrayList<RODRecordList>(states.size());
currentLoc = provider.getGenomeLocParser().createGenomeLoc(rec); for ( final RMDDataState state : states )
bindings.add(state.stream.getOverlapping(interval));
// flush the queue looking for records we've passed over return new RefMetaDataTracker(bindings);
while (mapping.size() > 0 && mapping.firstKey() < currentLoc.getStart())
mapping.pollFirstEntry(); // toss away records that we've passed
// add new data to the queue
for (RMDDataState state : states) {
// move into position
while (state.iterator.hasNext() && state.iterator.peekNextLocation().isBefore(currentLoc))
state.iterator.next();
while (state.iterator.hasNext() && state.iterator.peekNextLocation().overlapsP(currentLoc)) {
RODRecordList list = state.iterator.next();
for (GATKFeature datum : list) {
if (!mapping.containsKey(list.getLocation().getStart()))
mapping.put(list.getLocation().getStart(), new RODMetaDataContainer());
mapping.get(list.getLocation().getStart()).addEntry(datum);
}
}
} }
} }
/** Closes the current view. */ /**
* Closes the current view.
*/
public void close() { public void close() {
if (states == null) return; for (final RMDDataState state : states)
for (RMDDataState state : states) state.close();
state.dataSource.close( state.iterator );
// Clear out the existing data so that post-close() accesses to this data will fail-fast. // Clear out the existing data so that post-close() accesses to this data will fail-fast.
states = null; states.clear();
} }
/** Models the traversal state of a given ROD lane. */
private static class RMDDataState {
public final ReferenceOrderedDataSource dataSource;
public final IntervalOverlappingRODsFromStream stream;
private final LocationAwareSeekableRODIterator iterator;
} public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) {
this.dataSource = dataSource;
this.iterator = iterator;
this.stream = new IntervalOverlappingRODsFromStream(dataSource.getName(), new PeekableIterator<RODRecordList>(iterator));
}
/** Models the traversal state of a given ROD lane. */ /**
class RMDDataState { * For testing
public final ReferenceOrderedDataSource dataSource; */
public final LocationAwareSeekableRODIterator iterator; public RMDDataState(final String name, final PeekableIterator<RODRecordList> iterator) {
this.dataSource = null;
this.iterator = null;
this.stream = new IntervalOverlappingRODsFromStream(name, new PeekableIterator<RODRecordList>(iterator));
}
public RMDDataState(ReferenceOrderedDataSource dataSource, LocationAwareSeekableRODIterator iterator) { public void close() {
this.dataSource = dataSource; if ( dataSource != null )
this.iterator = iterator; dataSource.close( iterator );
}
} }
} }

View File

@ -59,16 +59,18 @@ public class ReadReferenceView extends ReferenceView {
} }
public byte[] getBases() { public byte[] getBases() {
// System.out.printf("Getting bases for location %s%n", loc);
// throw new StingException("x");
return getReferenceBases(loc); return getReferenceBases(loc);
} }
} }
public ReferenceContext getReferenceContext( SAMRecord read ) { /**
* Return a reference context appropriate for the span of read
*
* @param read the mapped read to test
* @return
*/
public ReferenceContext getReferenceContext( final SAMRecord read ) {
GenomeLoc loc = genomeLocParser.createGenomeLoc(read); GenomeLoc loc = genomeLocParser.createGenomeLoc(read);
// byte[] bases = super.getReferenceBases(loc);
// return new ReferenceContext( loc, loc, bases );
return new ReferenceContext( genomeLocParser, loc, loc, getReferenceBasesProvider(loc) ); return new ReferenceContext( genomeLocParser, loc, loc, getReferenceBasesProvider(loc) );
} }

View File

@ -101,7 +101,7 @@ public class RodLocusView extends LocusView implements ReferenceOrderedView {
public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) { public RefMetaDataTracker getReferenceOrderedDataAtLocus( GenomeLoc loc, ReferenceContext referenceContext ) {
// special case the interval again -- add it into the ROD // special case the interval again -- add it into the ROD
if ( interval != null ) { allTracksHere.add(interval); } if ( interval != null ) { allTracksHere.add(interval); }
return new RefMetaDataTracker(allTracksHere, referenceContext); return new RefMetaDataTracker(allTracksHere);
} }
public boolean hasNext() { public boolean hasNext() {

View File

@ -94,6 +94,13 @@ public abstract class ShardDataProvider {
return referenceOrderedData; return referenceOrderedData;
} }
/**
* @return true if reference ordered data will be provided by this shard
*/
public boolean hasReferenceOrderedData() {
return ! getReferenceOrderedData().isEmpty();
}
/** /**
* Create a data provider for the shard given the reads and reference. * Create a data provider for the shard given the reads and reference.
* @param shard The chunk of data over which traversals happen. * @param shard The chunk of data over which traversals happen.

View File

@ -124,7 +124,24 @@ public class BAMScheduler implements Iterator<FilePointer> {
*/ */
private FilePointer generatePointerOverEntireFileset() { private FilePointer generatePointerOverEntireFileset() {
FilePointer filePointer = new FilePointer(); FilePointer filePointer = new FilePointer();
Map<SAMReaderID,GATKBAMFileSpan> currentPosition = dataSource.getCurrentPosition();
// This is a "monolithic" FilePointer representing all regions in all files we will ever visit, and is
// the only FilePointer we will create. This allows us to have this FilePointer represent regions from
// multiple contigs
filePointer.setIsMonolithic(true);
Map<SAMReaderID,GATKBAMFileSpan> currentPosition;
// Only use the deprecated SAMDataSource.getCurrentPosition() if we're not using experimental downsampling
// TODO: clean this up once the experimental downsampling engine fork collapses
if ( dataSource.getReadsInfo().getDownsamplingMethod() != null && dataSource.getReadsInfo().getDownsamplingMethod().useExperimentalDownsampling ) {
currentPosition = dataSource.getInitialReaderPositions();
}
else {
currentPosition = dataSource.getCurrentPosition();
}
for(SAMReaderID reader: dataSource.getReaderIDs()) for(SAMReaderID reader: dataSource.getReaderIDs())
filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart())); filePointer.addFileSpans(reader,createSpanToEndOfFile(currentPosition.get(reader).getGATKChunks().get(0).getChunkStart()));
return filePointer; return filePointer;

View File

@ -0,0 +1,228 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMRecord;
import org.apache.log4j.Logger;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.*;
/**
* Convert from an unbalanced iterator over FilePointers to a balanced iterator over Shards.
*
* When processing FilePointers, our strategy is to aggregate all FilePointers for each contig
* together into one monolithic FilePointer, create one persistent set of read iterators over
* that monolithic FilePointer, and repeatedly use that persistent set of read iterators to
* fill read shards with reads.
*
* This strategy has several important advantages:
*
* 1. We avoid issues with file span overlap. FilePointers that are more granular than a whole
* contig will have regions that overlap with other FilePointers on the same contig, due
* to the limited granularity of BAM index data. By creating only one FilePointer per contig,
* we avoid having to track how much of each file region we've visited (as we did in the
* former implementation), we avoid expensive non-sequential access patterns in the files,
* and we avoid having to repeatedly re-create our iterator chain for every small region
* of interest.
*
* 2. We avoid boundary issues with the engine-level downsampling. Since we create a single
* persistent set of read iterators (which include the downsampling iterator(s)) per contig,
* the downsampling process is never interrupted by FilePointer or Shard boundaries, and never
* loses crucial state information while downsampling within a contig.
*
* TODO: There is also at least one important disadvantage:
*
* 1. We load more BAM index data into memory at once, and this work is done upfront before processing
* the next contig, creating a delay before traversal of each contig. This delay may be
* compensated for by the gains listed in #1 above, and we may be no worse off overall in
* terms of total runtime, but we need to verify this empirically.
*
* @author David Roazen
*/
public class ExperimentalReadShardBalancer extends ShardBalancer {
private static Logger logger = Logger.getLogger(ExperimentalReadShardBalancer.class);
/**
* Convert iterators of file pointers into balanced iterators of shards.
* @return An iterator over balanced shards.
*/
public Iterator<Shard> iterator() {
return new Iterator<Shard>() {
/**
* The cached shard to be returned next. Prefetched in the peekable iterator style.
*/
private Shard nextShard = null;
/**
* The file pointer currently being processed.
*/
private FilePointer currentContigFilePointer = null;
/**
* Iterator over the reads from the current contig's file pointer. The same iterator will be
* used to fill all shards associated with a given file pointer
*/
private PeekableIterator<SAMRecord> currentContigReadsIterator = null;
/**
* How many FilePointers have we pulled from the filePointers iterator?
*/
private int totalFilePointersConsumed = 0;
/**
* Have we encountered a monolithic FilePointer?
*/
private boolean encounteredMonolithicFilePointer = false;
{
createNextContigFilePointer();
advance();
}
public boolean hasNext() {
return nextShard != null;
}
public Shard next() {
if ( ! hasNext() )
throw new NoSuchElementException("No next read shard available");
Shard currentShard = nextShard;
advance();
return currentShard;
}
private void advance() {
nextShard = null;
// May need multiple iterations to fill the next shard if all reads in current file spans get filtered/downsampled away
while ( nextShard == null && currentContigFilePointer != null ) {
// If we've exhausted the current file pointer of reads, move to the next file pointer (if there is one):
if ( currentContigReadsIterator != null && ! currentContigReadsIterator.hasNext() ) {
// Close the old, exhausted chain of iterators to release resources
currentContigReadsIterator.close();
// Advance to the FilePointer for the next contig
createNextContigFilePointer();
// We'll need to create a fresh iterator for this file pointer when we create the first
// shard for it below.
currentContigReadsIterator = null;
}
// At this point our currentContigReadsIterator may be null or non-null depending on whether or not
// this is our first shard for this file pointer.
if ( currentContigFilePointer != null ) {
Shard shard = new ReadShard(parser,readsDataSource, currentContigFilePointer.fileSpans, currentContigFilePointer.locations, currentContigFilePointer.isRegionUnmapped);
// Create a new reads iterator only when we've just advanced to the file pointer for the next
// contig. It's essential that the iterators persist across all shards that share the same contig
// to allow the downsampling to work properly.
if ( currentContigReadsIterator == null ) {
currentContigReadsIterator = new PeekableIterator<SAMRecord>(readsDataSource.getIterator(shard));
}
if ( currentContigReadsIterator.hasNext() ) {
shard.fill(currentContigReadsIterator);
nextShard = shard;
}
}
}
}
/**
* Aggregate all FilePointers for the next contig together into one monolithic FilePointer
* to avoid boundary issues with visiting the same file regions more than once (since more
* granular FilePointers will have regions that overlap with other nearby FilePointers due
* to the nature of BAM indices).
*
* By creating one persistent set of iterators per contig we also avoid boundary artifacts
* in the engine-level downsampling.
*
* TODO: This FilePointer aggregation should ideally be done at the BAMSchedule level for
* TODO: read traversals, as there's little point in the BAMSchedule emitting extremely
* TODO: granular FilePointers if we're just going to union them. The BAMSchedule should
* TODO: emit one FilePointer per contig for read traversals (but, crucially, NOT for
* TODO: locus traversals).
*/
private void createNextContigFilePointer() {
currentContigFilePointer = null;
List<FilePointer> nextContigFilePointers = new ArrayList<FilePointer>();
logger.info("Loading BAM index data for next contig");
while ( filePointers.hasNext() ) {
// Make sure that if we see a monolithic FilePointer (representing all regions in all files) that
// it is the ONLY FilePointer we ever encounter
if ( encounteredMonolithicFilePointer ) {
throw new ReviewedStingException("Bug: encountered additional FilePointers after encountering a monolithic FilePointer");
}
if ( filePointers.peek().isMonolithic() ) {
if ( totalFilePointersConsumed > 0 ) {
throw new ReviewedStingException("Bug: encountered additional FilePointers before encountering a monolithic FilePointer");
}
encounteredMonolithicFilePointer = true;
logger.debug(String.format("Encountered monolithic FilePointer: %s", filePointers.peek()));
}
// If this is the first FP we've seen, or we're dealing with mapped regions and the next FP is on the
// same contig as previous FPs, or all our FPs are unmapped, add the next FP to the list of FPs to merge
if ( nextContigFilePointers.isEmpty() ||
(! nextContigFilePointers.get(0).isRegionUnmapped && ! filePointers.peek().isRegionUnmapped &&
nextContigFilePointers.get(0).getContigIndex() == filePointers.peek().getContigIndex()) ||
(nextContigFilePointers.get(0).isRegionUnmapped && filePointers.peek().isRegionUnmapped) ) {
nextContigFilePointers.add(filePointers.next());
totalFilePointersConsumed++;
}
else {
break; // next FilePointer is on a different contig or has different mapped/unmapped status,
// save it for next time
}
}
if ( ! nextContigFilePointers.isEmpty() ) {
currentContigFilePointer = FilePointer.union(nextContigFilePointers, parser);
}
if ( currentContigFilePointer != null ) {
logger.info("Done loading BAM index data for next contig");
logger.debug(String.format("Next contig FilePointer: %s", currentContigFilePointer));
}
}
public void remove() {
throw new UnsupportedOperationException("Unable to remove from shard balancing iterator");
}
};
}
}

View File

@ -26,7 +26,9 @@ package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator; import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.GATKBAMFileSpan; import net.sf.samtools.GATKBAMFileSpan;
import net.sf.samtools.GATKChunk;
import net.sf.samtools.SAMFileSpan; import net.sf.samtools.SAMFileSpan;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.Utils;
@ -48,18 +50,87 @@ public class FilePointer {
*/ */
protected final boolean isRegionUnmapped; protected final boolean isRegionUnmapped;
public FilePointer(final GenomeLoc... locations) { /**
this.locations.addAll(Arrays.asList(locations)); * Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
* from more than one contig.
*/
private boolean isMonolithic = false;
/**
* Index of the contig covered by this FilePointer. Only meaningful for non-monolithic, mapped FilePointers
*/
private Integer contigIndex = null;
public FilePointer( List<GenomeLoc> locations ) {
this.locations.addAll(locations);
this.isRegionUnmapped = checkUnmappedStatus();
validateAllLocations();
if ( locations.size() > 0 ) {
contigIndex = locations.get(0).getContigIndex();
}
}
public FilePointer( final GenomeLoc... locations ) {
this(Arrays.asList(locations));
}
public FilePointer( Map<SAMReaderID,SAMFileSpan> fileSpans, List<GenomeLoc> locations ) {
this(locations);
this.fileSpans.putAll(fileSpans);
}
private boolean checkUnmappedStatus() {
boolean foundMapped = false, foundUnmapped = false; boolean foundMapped = false, foundUnmapped = false;
for(GenomeLoc location: locations) {
if(GenomeLoc.isUnmapped(location)) for( GenomeLoc location: locations ) {
if ( GenomeLoc.isUnmapped(location) )
foundUnmapped = true; foundUnmapped = true;
else else
foundMapped = true; foundMapped = true;
} }
if(foundMapped && foundUnmapped) if ( foundMapped && foundUnmapped )
throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped."); throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
this.isRegionUnmapped = foundUnmapped;
return foundUnmapped;
}
private void validateAllLocations() {
// Unmapped and monolithic FilePointers are exempted from the one-contig-only restriction
if ( isRegionUnmapped || isMonolithic ) {
return;
}
Integer previousContigIndex = null;
for ( GenomeLoc location : locations ) {
if ( previousContigIndex != null && previousContigIndex != location.getContigIndex() ) {
throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
}
previousContigIndex = location.getContigIndex();
}
}
private void validateLocation( GenomeLoc location ) {
if ( isRegionUnmapped != GenomeLoc.isUnmapped(location) ) {
throw new ReviewedStingException("BUG: File pointers cannot be mixed mapped/unmapped.");
}
if ( ! isRegionUnmapped && ! isMonolithic && contigIndex != null && contigIndex != location.getContigIndex() ) {
throw new ReviewedStingException("Non-monolithic file pointers must contain intervals from at most one contig");
}
}
/**
* Returns an immutable view of this FilePointer's file spans
*
* @return an immutable view of this FilePointer's file spans
*/
public Map<SAMReaderID, SAMFileSpan> getFileSpans() {
return Collections.unmodifiableMap(fileSpans);
} }
/** /**
@ -70,6 +141,39 @@ public class FilePointer {
return Collections.unmodifiableList(locations); return Collections.unmodifiableList(locations);
} }
/**
* Returns the index of the contig into which this FilePointer points (a FilePointer can represent
* regions in at most one contig).
*
* @return the index of the contig into which this FilePointer points
*/
public int getContigIndex() {
return locations.size() > 0 ? locations.get(0).getContigIndex() : SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX;
}
/**
* Is this FilePointer "monolithic"? That is, does it represent all regions in all files that we will
* ever visit during this GATK run? If this is set to true, the engine will expect to see only this
* one FilePointer during the entire run, and this FilePointer will be allowed to contain intervals
* from more than one contig.
*
* @return true if this FP is a monolithic FP representing all regions in all files, otherwise false
*/
public boolean isMonolithic() {
return isMonolithic;
}
/**
* Set this FP's "monolithic" status to true or false. An FP is monolithic if it represents all
* regions in all files that we will ever visit, and is the only FP we will ever create. A monolithic
* FP may contain intervals from more than one contig.
*
* @param isMonolithic set this FP's monolithic status to this value
*/
public void setIsMonolithic( boolean isMonolithic ) {
this.isMonolithic = isMonolithic;
}
@Override @Override
public boolean equals(final Object other) { public boolean equals(final Object other) {
if(!(other instanceof FilePointer)) if(!(other instanceof FilePointer))
@ -98,7 +202,12 @@ public class FilePointer {
} }
public void addLocation(final GenomeLoc location) { public void addLocation(final GenomeLoc location) {
locations.add(location); validateLocation(location);
this.locations.add(location);
if ( contigIndex == null ) {
contigIndex = location.getContigIndex();
}
} }
public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) { public void addFileSpans(final SAMReaderID id, final SAMFileSpan fileSpan) {
@ -216,6 +325,84 @@ public class FilePointer {
combined.addFileSpans(initialElement.getKey(),fileSpan); combined.addFileSpans(initialElement.getKey(),fileSpan);
} }
/**
* Efficiently generate the union of the n FilePointers passed in. Much more efficient than
* combining two FilePointers at a time using the combine() method above.
*
* IMPORTANT: the FilePointers to be unioned must either all represent regions on the
* same contig, or all be unmapped, since we cannot create FilePointers with a mix of
* contigs or with mixed mapped/unmapped regions.
*
* @param filePointers the FilePointers to union
* @param parser our GenomeLocParser
* @return the union of the FilePointers passed in
*/
public static FilePointer union( List<FilePointer> filePointers, GenomeLocParser parser ) {
if ( filePointers == null || filePointers.isEmpty() ) {
return new FilePointer();
}
Map<SAMReaderID, List<GATKChunk>> fileChunks = new HashMap<SAMReaderID, List<GATKChunk>>();
List<GenomeLoc> locations = new ArrayList<GenomeLoc>();
// First extract all intervals and file chunks from the FilePointers into unsorted, unmerged collections
for ( FilePointer filePointer : filePointers ) {
locations.addAll(filePointer.getLocations());
for ( Map.Entry<SAMReaderID, SAMFileSpan> fileSpanEntry : filePointer.getFileSpans().entrySet() ) {
GATKBAMFileSpan fileSpan = (GATKBAMFileSpan)fileSpanEntry.getValue();
if ( fileChunks.containsKey(fileSpanEntry.getKey()) ) {
fileChunks.get(fileSpanEntry.getKey()).addAll(fileSpan.getGATKChunks());
}
else {
fileChunks.put(fileSpanEntry.getKey(), fileSpan.getGATKChunks());
}
}
}
// Now sort and merge the intervals
List<GenomeLoc> sortedMergedLocations = new ArrayList<GenomeLoc>();
sortedMergedLocations.addAll(IntervalUtils.sortAndMergeIntervals(parser, locations, IntervalMergingRule.ALL));
// For each BAM file, convert from an unsorted, unmerged list of chunks to a GATKBAMFileSpan containing
// the sorted, merged union of the chunks for that file
Map<SAMReaderID, SAMFileSpan> mergedFileSpans = new HashMap<SAMReaderID, SAMFileSpan>(fileChunks.size());
for ( Map.Entry<SAMReaderID, List<GATKChunk>> fileChunksEntry : fileChunks.entrySet() ) {
List<GATKChunk> unmergedChunks = fileChunksEntry.getValue();
mergedFileSpans.put(fileChunksEntry.getKey(),
(new GATKBAMFileSpan(unmergedChunks.toArray(new GATKChunk[unmergedChunks.size()]))).union(new GATKBAMFileSpan()));
}
return new FilePointer(mergedFileSpans, sortedMergedLocations);
}
/**
* Returns true if any of the file spans in this FilePointer overlap their counterparts in
* the other FilePointer. "Overlap" is defined as having an overlapping extent (the region
* from the start of the first chunk to the end of the last chunk).
*
* @param other the FilePointer against which to check overlap with this FilePointer
* @return true if any file spans overlap their counterparts in other, otherwise false
*/
public boolean hasFileSpansOverlappingWith( FilePointer other ) {
for ( Map.Entry<SAMReaderID, SAMFileSpan> thisFilePointerEntry : fileSpans.entrySet() ) {
GATKBAMFileSpan thisFileSpan = new GATKBAMFileSpan(thisFilePointerEntry.getValue());
SAMFileSpan otherEntry = other.fileSpans.get(thisFilePointerEntry.getKey());
if ( otherEntry == null ) {
continue; // no counterpart for this file span in other
}
GATKBAMFileSpan otherFileSpan = new GATKBAMFileSpan(otherEntry);
if ( thisFileSpan.getExtent().overlaps(otherFileSpan.getExtent()) ) {
return true;
}
}
return false;
}
@Override @Override
public String toString() { public String toString() {
StringBuilder builder = new StringBuilder(); StringBuilder builder = new StringBuilder();

View File

@ -73,8 +73,15 @@ public class IntervalSharder implements Iterator<FilePointer> {
*/ */
public FilePointer next() { public FilePointer next() {
FilePointer current = wrappedIterator.next(); FilePointer current = wrappedIterator.next();
while(wrappedIterator.hasNext() && current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped && current.minus(wrappedIterator.peek()) == 0)
while ( wrappedIterator.hasNext() &&
current.isRegionUnmapped == wrappedIterator.peek().isRegionUnmapped &&
(current.getContigIndex() == wrappedIterator.peek().getContigIndex() || current.isRegionUnmapped) &&
current.minus(wrappedIterator.peek()) == 0 ) {
current = current.combine(parser,wrappedIterator.next()); current = current.combine(parser,wrappedIterator.next());
}
return current; return current;
} }

View File

@ -42,8 +42,10 @@ public class LocusShardBalancer extends ShardBalancer {
public Shard next() { public Shard next() {
FilePointer current = filePointers.next(); FilePointer current = filePointers.next();
while(filePointers.hasNext() && current.minus(filePointers.peek()) == 0)
current = current.combine(parser,filePointers.next()); // FilePointers have already been combined as necessary at the IntervalSharder level. No
// need to do so again here.
return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans); return new LocusShard(parser,readsDataSource,current.getLocations(),current.fileSpans);
} }

View File

@ -1,16 +1,15 @@
package org.broadinstitute.sting.gatk.datasources.reads; package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.samtools.SAMFileSpan; import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMRecord; import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; import org.broadinstitute.sting.gatk.iterators.StingSAMIterator;
import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter; import org.broadinstitute.sting.gatk.iterators.StingSAMIteratorAdapter;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.util.ArrayList; import java.util.*;
import java.util.Collection;
import java.util.List;
import java.util.Map;
/** /**
* *
@ -35,10 +34,21 @@ import java.util.Map;
* @version 0.1 * @version 0.1
*/ */
public class ReadShard extends Shard { public class ReadShard extends Shard {
/**
* Default read shard buffer size
*/
public static final int DEFAULT_MAX_READS = 10000;
/** /**
* What is the maximum number of reads per BAM file which should go into a read shard. * What is the maximum number of reads per BAM file which should go into a read shard.
*
* TODO: this non-final static variable should either be made final or turned into an
* TODO: instance variable somewhere -- as both static and mutable it wreaks havoc
* TODO: with tests that use multiple instances of SAMDataSource (since SAMDataSource
* TODO: changes this value)
*/ */
public static int MAX_READS = 10000; public static int MAX_READS = DEFAULT_MAX_READS;
/** /**
* The reads making up this shard. * The reads making up this shard.
@ -52,12 +62,24 @@ public class ReadShard extends Shard {
/** /**
* Sets the maximum number of reads buffered in a read shard. Implemented as a weirdly static interface * Sets the maximum number of reads buffered in a read shard. Implemented as a weirdly static interface
* until we know what effect tuning this parameter has. * until we know what effect tuning this parameter has.
*
* TODO: this mutable static interface is awful and breaks tests -- need to refactor
*
* @param bufferSize New maximum number * @param bufferSize New maximum number
*/ */
static void setReadBufferSize(final int bufferSize) { static void setReadBufferSize(final int bufferSize) {
MAX_READS = bufferSize; MAX_READS = bufferSize;
} }
/**
* What read buffer size are we using?
*
* @return
*/
public static int getReadBufferSize() {
return MAX_READS;
}
/** /**
* Returns true if this shard is meant to buffer reads, rather * Returns true if this shard is meant to buffer reads, rather
* than just holding pointers to their locations. * than just holding pointers to their locations.
@ -93,6 +115,67 @@ public class ReadShard extends Shard {
reads.add(read); reads.add(read);
} }
/**
* Fills this shard's buffer with reads from the iterator passed in
*
* @param readIter Iterator from which to draw the reads to fill the shard
*/
@Override
public void fill( PeekableIterator<SAMRecord> readIter ) {
if( ! buffersReads() )
throw new ReviewedStingException("Attempting to fill a non-buffering shard.");
SAMFileHeader.SortOrder sortOrder = getReadProperties().getSortOrder();
SAMRecord read = null;
while( ! isBufferFull() && readIter.hasNext() ) {
final SAMRecord nextRead = readIter.peek();
if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) {
// only add reads to the shard if they are on the same contig
read = readIter.next();
addRead(read);
} else {
break;
}
}
// If the reads are sorted in coordinate order, ensure that all reads
// having the same alignment start become part of the same shard, to allow
// downsampling to work better across shard boundaries. Note that because our
// read stream has already been fed through the positional downsampler, which
// ensures that at each alignment start position there are no more than dcov
// reads, we're in no danger of accidentally creating a disproportionately huge
// shard
if ( sortOrder == SAMFileHeader.SortOrder.coordinate ) {
while ( readIter.hasNext() ) {
SAMRecord additionalRead = readIter.peek();
// Stop filling the shard as soon as we encounter a read having a different
// alignment start or contig from the last read added in the earlier loop
// above, or an unmapped read
if ( read == null ||
additionalRead.getReadUnmappedFlag() ||
! additionalRead.getReferenceIndex().equals(read.getReferenceIndex()) ||
additionalRead.getAlignmentStart() != read.getAlignmentStart() ) {
break;
}
addRead(readIter.next());
}
}
// If the reads are sorted in queryname order, ensure that all reads
// having the same queryname become part of the same shard.
if( sortOrder == SAMFileHeader.SortOrder.queryname ) {
while( readIter.hasNext() ) {
SAMRecord nextRead = readIter.peek();
if( read == null || ! read.getReadName().equals(nextRead.getReadName()) )
break;
addRead(readIter.next());
}
}
}
/** /**
* Creates an iterator over reads stored in this shard's read cache. * Creates an iterator over reads stored in this shard's read cache.
* @return * @return
@ -116,4 +199,48 @@ public class ReadShard extends Shard {
} }
return sb.toString(); return sb.toString();
} }
/**
* Get the full span from the start of the left most read to the end of the right most one
*
* Note this may be different than the getLocation() of the shard, as this reflects the
* targeted span, not the actual span of reads
*
* @return the genome loc representing the span of these reads on the genome
*/
public GenomeLoc getReadsSpan() {
if ( isUnmapped() || super.getGenomeLocs() == null || reads.isEmpty() )
return super.getLocation();
else {
int start = Integer.MAX_VALUE;
int stop = Integer.MIN_VALUE;
String contig = null;
boolean foundMapped = false;
for ( final SAMRecord read : reads ) {
if ( contig != null && ! read.getReferenceName().equals(contig) )
throw new ReviewedStingException("ReadShard contains reads spanning contig boundaries, which is no longer allowed. "
+ "First contig is " + contig + " next read was " + read.getReferenceName() );
contig = read.getReferenceName();
// Even if this shard as a *whole* is not "unmapped", we can still encounter *individual* unmapped mates
// of mapped reads within this shard's buffer. In fact, if we're very unlucky with shard boundaries,
// this shard might consist *only* of unmapped mates! We need to refrain from using the alignment
// starts/stops of these unmapped mates, and detect the case where the shard has been filled *only*
// with unmapped mates.
if ( ! read.getReadUnmappedFlag() ) {
foundMapped = true;
if ( read.getAlignmentStart() < start ) start = read.getAlignmentStart();
if ( read.getAlignmentEnd() > stop ) stop = read.getAlignmentEnd();
}
}
assert contig != null;
if ( ! foundMapped || contig.equals("*") ) // all reads are unmapped
return GenomeLoc.UNMAPPED;
else
return parser.createGenomeLoc(contig, start, stop);
}
}
} }

View File

@ -34,6 +34,8 @@ import java.util.NoSuchElementException;
/** /**
* Divide up large file pointers containing reads into more manageable subcomponents. * Divide up large file pointers containing reads into more manageable subcomponents.
*
* TODO: delete this class once the experimental downsampling engine fork collapses
*/ */
public class ReadShardBalancer extends ShardBalancer { public class ReadShardBalancer extends ShardBalancer {
/** /**

View File

@ -24,14 +24,15 @@
package org.broadinstitute.sting.gatk.datasources.reads; package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.reference.IndexedFastaSequenceFile;
import net.sf.picard.sam.MergingSamRecordIterator; import net.sf.picard.sam.MergingSamRecordIterator;
import net.sf.picard.sam.SamFileHeaderMerger; import net.sf.picard.sam.SamFileHeaderMerger;
import net.sf.samtools.*; import net.sf.samtools.*;
import net.sf.samtools.util.CloseableIterator; import net.sf.samtools.util.CloseableIterator;
import net.sf.samtools.util.RuntimeIOException; import net.sf.samtools.util.RuntimeIOException;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
import org.broadinstitute.sting.gatk.DownsamplingMethod; import org.broadinstitute.sting.gatk.downsampling.*;
import org.broadinstitute.sting.gatk.downsampling.DownsampleType;
import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod;
import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.ReadMetrics;
import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.ReadProperties;
import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion;
@ -42,12 +43,9 @@ import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.GenomeLocSortedSet;
import org.broadinstitute.sting.utils.SimpleTimer; import org.broadinstitute.sting.utils.SimpleTimer;
import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.baq.ReadTransformingIterator;
import org.broadinstitute.sting.utils.baq.BAQSamIterator;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.exceptions.UserException;
import org.broadinstitute.sting.utils.recalibration.BQSRSamIterator;
import org.broadinstitute.sting.utils.recalibration.BaseRecalibration;
import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory; import org.broadinstitute.sting.utils.sam.GATKSamRecordFactory;
import java.io.File; import java.io.File;
@ -101,6 +99,8 @@ public class SAMDataSource {
/** /**
* How far along is each reader? * How far along is each reader?
*
* TODO: delete this once the experimental downsampling engine fork collapses
*/ */
private final Map<SAMReaderID,GATKBAMFileSpan> readerPositions = new HashMap<SAMReaderID,GATKBAMFileSpan>(); private final Map<SAMReaderID,GATKBAMFileSpan> readerPositions = new HashMap<SAMReaderID,GATKBAMFileSpan>();
@ -200,11 +200,8 @@ public class SAMDataSource {
downsamplingMethod, downsamplingMethod,
exclusionList, exclusionList,
supplementalFilters, supplementalFilters,
Collections.<ReadTransformer>emptyList(),
includeReadsWithDeletionAtLoci, includeReadsWithDeletionAtLoci,
BAQ.CalculationMode.OFF,
BAQ.QualityMode.DONT_MODIFY,
null, // no BAQ
null, // no BQSR
(byte) -1, (byte) -1,
false); false);
} }
@ -234,11 +231,8 @@ public class SAMDataSource {
DownsamplingMethod downsamplingMethod, DownsamplingMethod downsamplingMethod,
ValidationExclusion exclusionList, ValidationExclusion exclusionList,
Collection<ReadFilter> supplementalFilters, Collection<ReadFilter> supplementalFilters,
List<ReadTransformer> readTransformers,
boolean includeReadsWithDeletionAtLoci, boolean includeReadsWithDeletionAtLoci,
BAQ.CalculationMode cmode,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
byte defaultBaseQualities, byte defaultBaseQualities,
boolean removeProgramRecords) { boolean removeProgramRecords) {
this.readMetrics = new ReadMetrics(); this.readMetrics = new ReadMetrics();
@ -258,11 +252,11 @@ public class SAMDataSource {
validationStringency = strictness; validationStringency = strictness;
this.removeProgramRecords = removeProgramRecords; this.removeProgramRecords = removeProgramRecords;
if(readBufferSize != null) if(readBufferSize != null)
ReadShard.setReadBufferSize(readBufferSize); ReadShard.setReadBufferSize(readBufferSize); // TODO: use of non-final static variable here is just awful, especially for parallel tests
else { else {
// Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively // Choose a sensible default for the read buffer size. For the moment, we're picking 1000 reads per BAM per shard (which effectively
// will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once. // will mean per-thread once ReadWalkers are parallelized) with a max cap of 250K reads in memory at once.
ReadShard.setReadBufferSize(Math.min(1000*samFiles.size(),250000)); ReadShard.setReadBufferSize(Math.min(10000*samFiles.size(),250000));
} }
resourcePool = new SAMResourcePool(Integer.MAX_VALUE); resourcePool = new SAMResourcePool(Integer.MAX_VALUE);
@ -303,16 +297,14 @@ public class SAMDataSource {
readProperties = new ReadProperties( readProperties = new ReadProperties(
samFiles, samFiles,
mergedHeader, mergedHeader,
sortOrder,
useOriginalBaseQualities, useOriginalBaseQualities,
strictness, strictness,
downsamplingMethod, downsamplingMethod,
exclusionList, exclusionList,
supplementalFilters, supplementalFilters,
readTransformers,
includeReadsWithDeletionAtLoci, includeReadsWithDeletionAtLoci,
cmode,
qmode,
refReader,
bqsrApplier,
defaultBaseQualities); defaultBaseQualities);
// cache the read group id (original) -> read group id (merged) // cache the read group id (original) -> read group id (merged)
@ -388,7 +380,10 @@ public class SAMDataSource {
/** /**
* Retrieves the current position within the BAM file. * Retrieves the current position within the BAM file.
* @return A mapping of reader to current position. * @return A mapping of reader to current position.
*
* TODO: delete this once the experimental downsampling engine fork collapses
*/ */
@Deprecated
public Map<SAMReaderID,GATKBAMFileSpan> getCurrentPosition() { public Map<SAMReaderID,GATKBAMFileSpan> getCurrentPosition() {
return readerPositions; return readerPositions;
} }
@ -471,9 +466,15 @@ public class SAMDataSource {
} }
/** /**
* Fill the given buffering shard with reads. * Legacy method to fill the given buffering shard with reads.
*
* Shard.fill() is used instead of this method when experimental downsampling is enabled
*
* TODO: delete this method once the experimental downsampling engine fork collapses
*
* @param shard Shard to fill. * @param shard Shard to fill.
*/ */
@Deprecated
public void fillShard(Shard shard) { public void fillShard(Shard shard) {
if(!shard.buffersReads()) if(!shard.buffersReads())
throw new ReviewedStingException("Attempting to fill a non-buffering shard."); throw new ReviewedStingException("Attempting to fill a non-buffering shard.");
@ -486,9 +487,15 @@ public class SAMDataSource {
CloseableIterator<SAMRecord> iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate); CloseableIterator<SAMRecord> iterator = getIterator(readers,shard,sortOrder == SAMFileHeader.SortOrder.coordinate);
while(!shard.isBufferFull() && iterator.hasNext()) { while(!shard.isBufferFull() && iterator.hasNext()) {
read = iterator.next(); final SAMRecord nextRead = iterator.next();
shard.addRead(read); if ( read == null || (nextRead.getReferenceIndex().equals(read.getReferenceIndex())) ) {
noteFilePositionUpdate(positionUpdates,read); // only add reads to the shard if they are on the same contig
read = nextRead;
shard.addRead(read);
noteFilePositionUpdate(positionUpdates,read);
} else {
break;
}
} }
// If the reads are sorted in queryname order, ensure that all reads // If the reads are sorted in queryname order, ensure that all reads
@ -510,6 +517,10 @@ public class SAMDataSource {
readerPositions.put(readers.getReaderID(positionUpdate.getKey()),positionUpdate.getValue()); readerPositions.put(readers.getReaderID(positionUpdate.getKey()),positionUpdate.getValue());
} }
/*
* TODO: delete this method once the experimental downsampling engine fork collapses
*/
@Deprecated
private void noteFilePositionUpdate(Map<SAMFileReader,GATKBAMFileSpan> positionMapping, SAMRecord read) { private void noteFilePositionUpdate(Map<SAMFileReader,GATKBAMFileSpan> positionMapping, SAMRecord read) {
GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing()); GATKBAMFileSpan endChunk = new GATKBAMFileSpan(read.getFileSource().getFilePointer().getContentsFollowing());
positionMapping.put(read.getFileSource().getReader(),endChunk); positionMapping.put(read.getFileSource().getReader(),endChunk);
@ -520,8 +531,7 @@ public class SAMDataSource {
return shard.iterator(); return shard.iterator();
} }
else { else {
SAMReaders readers = resourcePool.getAvailableReaders(); return getIterator(shard);
return getIterator(readers,shard,shard instanceof ReadShard);
} }
} }
@ -541,13 +551,44 @@ public class SAMDataSource {
/** /**
* Initialize the current reader positions * Initialize the current reader positions
*
* TODO: delete this once the experimental downsampling engine fork collapses
*
* @param readers * @param readers
*/ */
@Deprecated
private void initializeReaderPositions(SAMReaders readers) { private void initializeReaderPositions(SAMReaders readers) {
for(SAMReaderID id: getReaderIDs()) for(SAMReaderID id: getReaderIDs())
readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads())); readerPositions.put(id,new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
} }
/**
* Get the initial reader positions across all BAM files
*
* @return the start positions of the first chunk of reads for all BAM files
*/
public Map<SAMReaderID, GATKBAMFileSpan> getInitialReaderPositions() {
Map<SAMReaderID, GATKBAMFileSpan> initialPositions = new HashMap<SAMReaderID, GATKBAMFileSpan>();
SAMReaders readers = resourcePool.getAvailableReaders();
for ( SAMReaderID id: getReaderIDs() ) {
initialPositions.put(id, new GATKBAMFileSpan(readers.getReader(id).getFilePointerSpanningReads()));
}
resourcePool.releaseReaders(readers);
return initialPositions;
}
/**
* Get an iterator over the data types specified in the shard.
*
* @param shard The shard specifying the data limits.
* @return An iterator over the selected data.
*/
public StingSAMIterator getIterator( Shard shard ) {
return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard);
}
/** /**
* Get an iterator over the data types specified in the shard. * Get an iterator over the data types specified in the shard.
* @param readers Readers from which to load data. * @param readers Readers from which to load data.
@ -585,6 +626,7 @@ public class SAMDataSource {
iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator); iterator = new MalformedBAMErrorReformatingIterator(id.samFile, iterator);
if(shard.getGenomeLocs().size() > 0) if(shard.getGenomeLocs().size() > 0)
iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs()); iterator = new IntervalOverlapFilteringIterator(iterator,shard.getGenomeLocs());
iteratorMap.put(readers.getReader(id), iterator); iteratorMap.put(readers.getReader(id), iterator);
} }
@ -597,10 +639,7 @@ public class SAMDataSource {
readProperties.getDownsamplingMethod().toFraction, readProperties.getDownsamplingMethod().toFraction,
readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION), readProperties.getValidationExclusionList().contains(ValidationExclusion.TYPE.NO_READ_ORDER_VERIFICATION),
readProperties.getSupplementalFilters(), readProperties.getSupplementalFilters(),
readProperties.getBAQCalculationMode(), readProperties.getReadTransformers(),
readProperties.getBAQQualityMode(),
readProperties.getRefReader(),
readProperties.getBQSRApplier(),
readProperties.defaultBaseQualities()); readProperties.defaultBaseQualities());
} }
@ -667,40 +706,62 @@ public class SAMDataSource {
Double downsamplingFraction, Double downsamplingFraction,
Boolean noValidationOfReadOrder, Boolean noValidationOfReadOrder,
Collection<ReadFilter> supplementalFilters, Collection<ReadFilter> supplementalFilters,
BAQ.CalculationMode cmode, List<ReadTransformer> readTransformers,
BAQ.QualityMode qmode,
IndexedFastaSequenceFile refReader,
BaseRecalibration bqsrApplier,
byte defaultBaseQualities) { byte defaultBaseQualities) {
// *********************************************************************************** // // ************************************************************************************************ //
// * NOTE: ALL FILTERING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * // // * NOTE: ALL FILTERING/DOWNSAMPLING SHOULD BE DONE BEFORE ANY ITERATORS THAT MODIFY THE READS! * //
// * (otherwise we will process something that we may end up throwing away) * // // * (otherwise we will process something that we may end up throwing away) * //
// *********************************************************************************** // // ************************************************************************************************ //
if (downsamplingFraction != null) wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
wrappedIterator = new DownsampleIterator(wrappedIterator, downsamplingFraction);
if ( readProperties.getDownsamplingMethod().useExperimentalDownsampling ) {
wrappedIterator = applyDownsamplingIterator(wrappedIterator);
}
// Use the old fractional downsampler only if we're not using experimental downsampling:
if ( ! readProperties.getDownsamplingMethod().useExperimentalDownsampling && downsamplingFraction != null )
wrappedIterator = new LegacyDownsampleIterator(wrappedIterator, downsamplingFraction);
// unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification, // unless they've said not to validate read ordering (!noValidationOfReadOrder) and we've enabled verification,
// verify the read ordering by applying a sort order iterator // verify the read ordering by applying a sort order iterator
if (!noValidationOfReadOrder && enableVerification) if (!noValidationOfReadOrder && enableVerification)
wrappedIterator = new VerifyingSamIterator(genomeLocParser,wrappedIterator); wrappedIterator = new VerifyingSamIterator(wrappedIterator);
wrappedIterator = StingSAMIteratorAdapter.adapt(new CountingFilteringIterator(readMetrics,wrappedIterator,supplementalFilters));
if (useOriginalBaseQualities || defaultBaseQualities >= 0) if (useOriginalBaseQualities || defaultBaseQualities >= 0)
// only wrap if we are replacing the original qualities or using a default base quality // only wrap if we are replacing the original qualities or using a default base quality
wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities); wrappedIterator = new ReadFormattingIterator(wrappedIterator, useOriginalBaseQualities, defaultBaseQualities);
if (bqsrApplier != null) // set up read transformers
wrappedIterator = new BQSRSamIterator(wrappedIterator, bqsrApplier); for ( final ReadTransformer readTransformer : readTransformers ) {
if ( readTransformer.enabled() && readTransformer.getApplicationTime() == ReadTransformer.ApplicationTime.ON_INPUT )
if (cmode != BAQ.CalculationMode.OFF) wrappedIterator = new ReadTransformingIterator(wrappedIterator, readTransformer);
wrappedIterator = new BAQSamIterator(refReader, wrappedIterator, cmode, qmode); }
return wrappedIterator; return wrappedIterator;
} }
protected StingSAMIterator applyDownsamplingIterator( StingSAMIterator wrappedIterator ) {
if ( readProperties.getDownsamplingMethod().type == DownsampleType.BY_SAMPLE ) {
ReadsDownsamplerFactory<SAMRecord> downsamplerFactory = readProperties.getDownsamplingMethod().toCoverage != null ?
new SimplePositionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
new FractionalDownsamplerFactory<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
return new PerSampleDownsamplingReadsIterator(wrappedIterator, downsamplerFactory);
}
else if ( readProperties.getDownsamplingMethod().type == DownsampleType.ALL_READS ) {
ReadsDownsampler<SAMRecord> downsampler = readProperties.getDownsamplingMethod().toCoverage != null ?
new SimplePositionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toCoverage) :
new FractionalDownsampler<SAMRecord>(readProperties.getDownsamplingMethod().toFraction);
return new DownsamplingReadsIterator(wrappedIterator, downsampler);
}
return wrappedIterator;
}
private class SAMResourcePool { private class SAMResourcePool {
/** /**
* How many entries can be cached in this resource pool? * How many entries can be cached in this resource pool?
@ -947,6 +1008,12 @@ public class SAMDataSource {
} catch ( SAMFormatException e ) { } catch ( SAMFormatException e ) {
throw new UserException.MalformedBAM(readerID.samFile, e.getMessage()); throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
} }
// Picard is throwing a RuntimeException here when BAMs are malformed with bad headers (and so look like SAM files).
// Let's keep this separate from the SAMFormatException (which ultimately derives from RuntimeException) case,
// just in case we want to change this behavior later.
catch ( RuntimeException e ) {
throw new UserException.MalformedBAM(readerID.samFile, e.getMessage());
}
reader.setSAMRecordFactory(factory); reader.setSAMRecordFactory(factory);
reader.enableFileSource(true); reader.enableFileSource(true);
reader.setValidationStringency(validationStringency); reader.setValidationStringency(validationStringency);

View File

@ -1,5 +1,6 @@
package org.broadinstitute.sting.gatk.datasources.reads; package org.broadinstitute.sting.gatk.datasources.reads;
import net.sf.picard.util.PeekableIterator;
import net.sf.samtools.SAMFileSpan; import net.sf.samtools.SAMFileSpan;
import net.sf.samtools.SAMRecord; import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.ReadMetrics;
@ -203,6 +204,12 @@ public abstract class Shard implements HasGenomeLocation {
*/ */
public void addRead(SAMRecord read) { throw new UnsupportedOperationException("This shard does not buffer reads."); } public void addRead(SAMRecord read) { throw new UnsupportedOperationException("This shard does not buffer reads."); }
/**
* Fills the shard with reads. Can only do this with shards that buffer reads
* @param readIter Iterator from which to draw the reads to fill the shard
*/
public void fill( PeekableIterator<SAMRecord> readIter ) { throw new UnsupportedOperationException("This shard does not buffer reads."); }
/** /**
* Gets the iterator over the elements cached in the shard. * Gets the iterator over the elements cached in the shard.
* @return * @return

View File

@ -34,8 +34,10 @@ import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet;
import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File; import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.lang.reflect.Type; import java.lang.reflect.Type;
import java.util.List; import java.util.List;
@ -239,6 +241,8 @@ class ReferenceOrderedQueryDataPool extends ResourcePool<RMDTrack,LocationAwareS
} else { } else {
return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.getIterator()); return new SeekableRODIterator(header,sequenceDictionary,referenceSequenceDictionary,genomeLocParser,track.getIterator());
} }
} catch (FileNotFoundException e) {
throw new UserException.CouldNotReadInputFile(fileDescriptor.getName(), "it could not be found");
} catch (IOException e) { } catch (IOException e) {
throw new ReviewedStingException("Unable to create iterator for rod named " + fileDescriptor.getName(),e); throw new ReviewedStingException("Unable to create iterator for rod named " + fileDescriptor.getName(),e);
} }

View File

@ -1,4 +1,4 @@
package org.broadinstitute.sting.gatk; package org.broadinstitute.sting.gatk.downsampling;
/** /**
* Type of downsampling method to invoke. * Type of downsampling method to invoke.

View File

@ -28,49 +28,92 @@ import java.util.Collection;
import java.util.List; import java.util.List;
/** /**
* The basic downsampler API, with no reads-specific operations * The basic downsampler API, with no reads-specific operations.
*
* Downsamplers that extend this interface rather than the ReadsDownsampler interface can handle
* any kind of item, however they cannot be wrapped within a DownsamplingReadsIterator or a
* PerSampleDownsamplingReadsIterator.
* *
* @author David Roazen * @author David Roazen
*/ */
public interface Downsampler<T> { public interface Downsampler<T> {
/* /**
* Submit one item to the downsampler for consideration . Some downsamplers will be able to determine * Submit one item to the downsampler for consideration. Some downsamplers will be able to determine
* immediately whether the item survives the downsampling process, while others will need to see * immediately whether the item survives the downsampling process, while others will need to see
* more items before making that determination. * more items before making that determination.
*
* @param item the individual item to submit to the downsampler for consideration
*/ */
public void submit( T item ); public void submit( T item );
/* /**
* Submit a collection of items to the downsampler for consideration. * Submit a collection of items to the downsampler for consideration. Should be equivalent to calling
* submit() on each individual item in the collection.
*
* @param items the collection of items to submit to the downsampler for consideration
*/ */
public void submit( Collection<T> items ); public void submit( Collection<T> items );
/* /**
* Are there items that have survived the downsampling process waiting to be retrieved? * Are there items that have survived the downsampling process waiting to be retrieved?
*
* @return true if this downsampler has > 0 finalized items, otherwise false
*/ */
public boolean hasDownsampledItems(); public boolean hasFinalizedItems();
/* /**
* Return (and remove) all items that have survived downsampling and are waiting to be retrieved. * Return (and *remove*) all items that have survived downsampling and are waiting to be retrieved.
*
* @return a list of all finalized items this downsampler contains, or an empty list if there are none
*/ */
public List<T> consumeDownsampledItems(); public List<T> consumeFinalizedItems();
/* /**
* Are there items stored in this downsampler that it doesn't yet know whether they will * Are there items stored in this downsampler that it doesn't yet know whether they will
* ultimately survive the downsampling process? * ultimately survive the downsampling process?
*
* @return true if this downsampler has > 0 pending items, otherwise false
*/ */
public boolean hasPendingItems(); public boolean hasPendingItems();
/* /**
* Peek at the first finalized item stored in this downsampler (or null if there are no finalized items)
*
* @return the first finalized item in this downsampler (the item is not removed from the downsampler by this call),
* or null if there are none
*/
public T peekFinalized();
/**
* Peek at the first pending item stored in this downsampler (or null if there are no pending items)
*
* @return the first pending item stored in this downsampler (the item is not removed from the downsampler by this call),
* or null if there are none
*/
public T peekPending();
/**
* Returns the number of items discarded (so far) during the downsampling process
*
* @return the number of items that have been submitted to this downsampler and discarded in the process of
* downsampling
*/
public int getNumberOfDiscardedItems();
/**
* Used to tell the downsampler that no more items will be submitted to it, and that it should * Used to tell the downsampler that no more items will be submitted to it, and that it should
* finalize any pending items. * finalize any pending items.
*/ */
public void signalEndOfInput(); public void signalEndOfInput();
/* /**
* Reset the downsampler to a clean state, devoid of any pending/downsampled items or tracked state * Empty the downsampler of all finalized/pending items
* information.
*/ */
public void clear(); public void clear();
/**
* Reset stats in the downsampler such as the number of discarded items *without* clearing the downsampler of items
*/
public void reset();
} }

View File

@ -0,0 +1,153 @@
/*
* Copyright (c) 2012, The Broad Institute
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
package org.broadinstitute.sting.gatk.downsampling;
import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker;
import org.broadinstitute.sting.gatk.walkers.LocusWalker;
import org.broadinstitute.sting.gatk.walkers.Walker;
import org.broadinstitute.sting.utils.exceptions.UserException;
/**
* Describes the method for downsampling reads at a given locus.
*/
public class DownsamplingMethod {
    /**
     * Type of downsampling to perform.
     */
    public final DownsampleType type;

    /**
     * Actual downsampling target is specified as an integer number of reads.
     * Null when no coverage target applies (e.g. type NONE, or fraction-based downsampling).
     */
    public final Integer toCoverage;

    /**
     * Actual downsampling target is specified as a fraction of total available reads.
     * Null when no fraction target applies (e.g. type NONE, or coverage-based downsampling).
     */
    public final Double toFraction;

    /**
     * Use the new experimental downsampling?
     */
    public final boolean useExperimentalDownsampling;

    /**
     * Expresses no downsampling applied at all.
     */
    public static final DownsamplingMethod NONE = new DownsamplingMethod(DownsampleType.NONE,null,null,false);

    /**
     * Default type to use if no type is specified
     */
    public static DownsampleType DEFAULT_DOWNSAMPLING_TYPE = DownsampleType.BY_SAMPLE;

    /**
     * Default target coverage for locus-based traversals
     */
    public static int DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE = 1000;

    /**
     * Creates and validates a downsampling method.
     *
     * @param type kind of downsampling to perform; null defaults to DEFAULT_DOWNSAMPLING_TYPE
     * @param toCoverage target coverage as an integer number of reads, or null; ignored when type is NONE
     * @param toFraction target fraction of total reads (0.0-1.0), or null; ignored when type is NONE
     * @param useExperimentalDownsampling whether the experimental downsampling engine is in use
     * @throws UserException.CommandLineException if the combination of settings is invalid
     */
    public DownsamplingMethod( DownsampleType type, Integer toCoverage, Double toFraction, boolean useExperimentalDownsampling ) {
        this.type = type != null ? type : DEFAULT_DOWNSAMPLING_TYPE;
        this.useExperimentalDownsampling = useExperimentalDownsampling;

        // BUGFIX: when no downsampling is requested, the coverage/fraction targets are
        // irrelevant and must be cleared. The previous code nulled the constructor
        // *parameters* after the final fields had already been assigned -- a dead store
        // that let stale targets leak through to validate() and to callers.
        if ( this.type == DownsampleType.NONE ) {
            this.toCoverage = null;
            this.toFraction = null;
        }
        else {
            this.toCoverage = toCoverage;
            this.toFraction = toFraction;
        }

        validate();
    }

    /**
     * Enforces the invariants of a downsampling specification, throwing a
     * UserException.CommandLineException on any violation.
     */
    private void validate() {
        // Can't leave toFraction and toCoverage null unless type is NONE
        if ( type != DownsampleType.NONE && toFraction == null && toCoverage == null )
            throw new UserException.CommandLineException("Must specify either toFraction or toCoverage when downsampling.");

        // Fraction and coverage cannot both be specified.
        if ( toFraction != null && toCoverage != null )
            throw new UserException.CommandLineException("Downsampling coverage and fraction are both specified. Please choose only one.");

        // toCoverage must be > 0 when specified
        if ( toCoverage != null && toCoverage <= 0 ) {
            throw new UserException.CommandLineException("toCoverage must be > 0 when downsampling to coverage");
        }

        // toFraction must be >= 0.0 and <= 1.0 when specified
        if ( toFraction != null && (toFraction < 0.0 || toFraction > 1.0) ) {
            throw new UserException.CommandLineException("toFraction must be >= 0.0 and <= 1.0 when downsampling to a fraction of reads");
        }

        // Some restrictions only exist for the old downsampling implementation:
        if ( ! useExperimentalDownsampling ) {
            // By sample downsampling does not work with a fraction of reads in the old downsampling implementation
            if( type == DownsampleType.BY_SAMPLE && toFraction != null )
                throw new UserException.CommandLineException("Cannot downsample to fraction with the BY_SAMPLE method");
        }

        // Some restrictions only exist for the new downsampling implementation:
        if ( useExperimentalDownsampling ) {
            if ( type == DownsampleType.ALL_READS && toCoverage != null ) {
                throw new UserException.CommandLineException("Cannot downsample to coverage with the ALL_READS method in the experimental downsampling implementation");
            }
        }
    }

    /**
     * @return a human-readable summary of these downsampling settings
     */
    @Override
    public String toString() {
        StringBuilder builder = new StringBuilder("Downsampling Settings: ");

        if ( type == DownsampleType.NONE ) {
            builder.append("No downsampling");
        }
        else {
            builder.append(String.format("Method: %s ", type));

            if ( toCoverage != null ) {
                builder.append(String.format("Target Coverage: %d ", toCoverage));
            }
            else {
                builder.append(String.format("Target Fraction: %.2f ", toFraction));
            }

            if ( useExperimentalDownsampling ) {
                builder.append("Using Experimental Downsampling");
            }
        }

        return builder.toString();
    }

    /**
     * Chooses the default downsampling method for a walker: locus-based traversals
     * (LocusWalker/ActiveRegionWalker) get per-sample downsampling to the default
     * coverage; all other walkers get no downsampling.
     *
     * @param walker the walker whose traversal type determines the default
     * @param useExperimentalDownsampling whether the experimental downsampling engine is in use
     * @return the default downsampling method for the given walker
     */
    public static DownsamplingMethod getDefaultDownsamplingMethod( Walker walker, boolean useExperimentalDownsampling ) {
        if ( walker instanceof LocusWalker || walker instanceof ActiveRegionWalker ) {
            return new DownsamplingMethod(DEFAULT_DOWNSAMPLING_TYPE, DEFAULT_LOCUS_BASED_TRAVERSAL_DOWNSAMPLING_COVERAGE,
                                          null, useExperimentalDownsampling);
        }
        else {
            return new DownsamplingMethod(DownsampleType.NONE, null, null, useExperimentalDownsampling);
        }
    }
}

View File

@ -33,7 +33,8 @@ import java.util.NoSuchElementException;
/** /**
* StingSAMIterator wrapper around our generic reads downsampler interface * StingSAMIterator wrapper around our generic reads downsampler interface. Converts the push-style
* downsampler interface to a pull model.
* *
* @author David Roazen * @author David Roazen
*/ */
@ -42,35 +43,50 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
private StingSAMIterator nestedSAMIterator; private StingSAMIterator nestedSAMIterator;
private ReadsDownsampler<SAMRecord> downsampler; private ReadsDownsampler<SAMRecord> downsampler;
private Collection<SAMRecord> downsampledReadsCache; private Collection<SAMRecord> downsampledReadsCache;
private Iterator<SAMRecord> downsampledReadsCacheIterator; private SAMRecord nextRead = null;
private Iterator<SAMRecord> downsampledReadsCacheIterator = null;
/**
* @param iter wrapped iterator from which this iterator will pull reads
* @param downsampler downsampler through which the reads will be fed
*/
public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) { public DownsamplingReadsIterator( StingSAMIterator iter, ReadsDownsampler<SAMRecord> downsampler ) {
nestedSAMIterator = iter; nestedSAMIterator = iter;
this.downsampler = downsampler; this.downsampler = downsampler;
fillDownsampledReadsCache();
advanceToNextRead();
} }
public boolean hasNext() { public boolean hasNext() {
if ( downsampledReadsCacheIterator.hasNext() ) { return nextRead != null;
return true;
}
else if ( ! nestedSAMIterator.hasNext() || ! fillDownsampledReadsCache() ) {
return false;
}
return true;
} }
public SAMRecord next() { public SAMRecord next() {
if ( ! downsampledReadsCacheIterator.hasNext() && ! fillDownsampledReadsCache() ) { if ( nextRead == null ) {
throw new NoSuchElementException("next() called when there are no more items"); throw new NoSuchElementException("next() called when there are no more items");
} }
return downsampledReadsCacheIterator.next(); SAMRecord toReturn = nextRead;
advanceToNextRead();
return toReturn;
}
private void advanceToNextRead() {
if ( ! readyToReleaseReads() && ! fillDownsampledReadsCache() ) {
nextRead = null;
}
else {
nextRead = downsampledReadsCacheIterator.next();
}
}
private boolean readyToReleaseReads() {
return downsampledReadsCacheIterator != null && downsampledReadsCacheIterator.hasNext();
} }
private boolean fillDownsampledReadsCache() { private boolean fillDownsampledReadsCache() {
while ( nestedSAMIterator.hasNext() && ! downsampler.hasDownsampledItems() ) { while ( nestedSAMIterator.hasNext() && ! downsampler.hasFinalizedItems() ) {
downsampler.submit(nestedSAMIterator.next()); downsampler.submit(nestedSAMIterator.next());
} }
@ -78,7 +94,8 @@ public class DownsamplingReadsIterator implements StingSAMIterator {
downsampler.signalEndOfInput(); downsampler.signalEndOfInput();
} }
downsampledReadsCache = downsampler.consumeDownsampledItems(); // use returned collection directly rather than make a copy, for speed
downsampledReadsCache = downsampler.consumeFinalizedItems();
downsampledReadsCacheIterator = downsampledReadsCache.iterator(); downsampledReadsCacheIterator = downsampledReadsCache.iterator();
return downsampledReadsCacheIterator.hasNext(); return downsampledReadsCacheIterator.hasNext();

View File

@ -33,7 +33,10 @@ import java.util.Collection;
import java.util.List; import java.util.List;
/** /**
* Fractional Downsampler: selects a specified fraction of the reads for inclusion * Fractional Downsampler: selects a specified fraction of the reads for inclusion.
*
* Since the selection is done randomly, the actual fraction of reads retained may be slightly
* more or less than the requested fraction, depending on the total number of reads submitted.
* *
* @author David Roazen * @author David Roazen
*/ */
@ -43,8 +46,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
private int cutoffForInclusion; private int cutoffForInclusion;
private int numDiscardedItems;
private static final int RANDOM_POOL_SIZE = 10000; private static final int RANDOM_POOL_SIZE = 10000;
/**
* Construct a FractionalDownsampler
*
* @param fraction Fraction of reads to preserve, between 0.0 (inclusive) and 1.0 (inclusive).
* Actual number of reads preserved may differ randomly.
*/
public FractionalDownsampler( double fraction ) { public FractionalDownsampler( double fraction ) {
if ( fraction < 0.0 || fraction > 1.0 ) { if ( fraction < 0.0 || fraction > 1.0 ) {
throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive"); throw new ReviewedStingException("Fraction of reads to include must be between 0.0 and 1.0, inclusive");
@ -52,12 +63,16 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE); cutoffForInclusion = (int)(fraction * RANDOM_POOL_SIZE);
clear(); clear();
reset();
} }
public void submit( T newRead ) { public void submit( T newRead ) {
if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) { if ( GenomeAnalysisEngine.getRandomGenerator().nextInt(10000) < cutoffForInclusion ) {
selectedReads.add(newRead); selectedReads.add(newRead);
} }
else {
numDiscardedItems++;
}
} }
public void submit( Collection<T> newReads ) { public void submit( Collection<T> newReads ) {
@ -66,11 +81,12 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
} }
} }
public boolean hasDownsampledItems() { public boolean hasFinalizedItems() {
return selectedReads.size() > 0; return selectedReads.size() > 0;
} }
public List<T> consumeDownsampledItems() { public List<T> consumeFinalizedItems() {
// pass by reference rather than make a copy, for speed
List<T> downsampledItems = selectedReads; List<T> downsampledItems = selectedReads;
clear(); clear();
return downsampledItems; return downsampledItems;
@ -80,6 +96,18 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
return false; return false;
} }
public T peekFinalized() {
return selectedReads.isEmpty() ? null : selectedReads.get(0);
}
public T peekPending() {
return null;
}
public int getNumberOfDiscardedItems() {
return numDiscardedItems;
}
public void signalEndOfInput() { public void signalEndOfInput() {
// NO-OP // NO-OP
} }
@ -88,7 +116,15 @@ public class FractionalDownsampler<T extends SAMRecord> implements ReadsDownsamp
selectedReads = new ArrayList<T>(); selectedReads = new ArrayList<T>();
} }
public void reset() {
numDiscardedItems = 0;
}
public boolean requiresCoordinateSortOrder() { public boolean requiresCoordinateSortOrder() {
return false; return false;
} }
public void signalNoMoreReadsBefore( T read ) {
// NO-OP
}
} }

Some files were not shown because too many files have changed in this diff Show More